In [1]:
from typing import Tuple, List, Dict, Any

In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, Imputer, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, KFold
from sklearn.metrics import mean_squared_error, make_scorer
from catboost import CatBoostRegressor, Pool
import joblib
import mlflow
from meteocalc import feels_like, heat_index, wind_chill, Temp

# Show every column when a wide DataFrame is displayed.
pd.options.display.max_columns = None
# Experiment name handed to mlflow.set_experiment() by the training helpers in this notebook.
CURRENT_EXPERIMENT_NAME = 'catboost'

# Utilities

In [87]:
def filter_by(df: pd.DataFrame, **kwargs) -> pd.DataFrame:
    """Return the rows of ``df`` that satisfy every ``column=value`` condition.

    Each keyword names a column. A list, tuple, set or frozenset value keeps
    rows whose column value is a member of the collection; any other value
    keeps rows whose column equals it. Conditions are combined with AND.

    With no conditions the input frame itself is returned (not a copy).
    """
    df_out = df
    for key, value in kwargs.items():
        # Only plain lists used to take the membership path; tuples and sets
        # fell through to ``==`` and were compared element-wise or not at all.
        if isinstance(value, (list, tuple, set, frozenset)):
            df_out = df_out[df_out[key].isin(value)]
        else:
            df_out = df_out[df_out[key] == value]
    return df_out


def missing_rate(df: pd.DataFrame) -> pd.Series:
    """Fraction of null values in each column of ``df``, indexed by column name."""
    n_rows = len(df)
    null_counts = df.isnull().sum()
    return null_counts / n_rows


def reduce_mem_usage(df: pd.DataFrame, verbose: bool = True):
    """Downcast numeric columns of ``df`` to the narrowest dtype covering their range.

    Integer columns (int16/32/64) become the smallest of int8/16/32/64 whose
    limits contain the column's min and max. Float columns become float16 or
    float32 when the range fits, else float64. Range checks are inclusive, so
    a column whose extremes sit exactly on a dtype's limits (e.g. -128..127)
    is still narrowed to that dtype.

    The frame is modified IN PLACE and also returned, so the function can be
    used with ``DataFrame.pipe``. Columns of any other dtype are left alone.

    NOTE(review): float downcasting is chosen by range only; float16/float32
    keep fewer significant digits, so values may be rounded.

    Args:
        df: frame to shrink (mutated).
        verbose: when true, print the final size and the percentage saved.

    Returns:
        The same ``df`` object.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / (1024 ** 2)
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Empty columns give NaN extremes: every comparison is False and
            # the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, all-NaN, or infinite: keep full width.
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-byte starting size.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, reduction)
        )

    return df


def rmse(y_true, y_pred) -> float:
    """Root mean squared error between ``y_true`` and ``y_pred``."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


# Scorer for sklearn model selection. greater_is_better=False makes sklearn
# report the negated RMSE, which is why callers in this notebook flip the sign back.
rmse_score = make_scorer(rmse, greater_is_better=False)


def add_key_prefix(d: Dict, prefix = 'best_') -> Dict:
    """Return a new dict with ``prefix`` prepended to every key of ``d``; values are unchanged."""
    prefixed = {}
    for key, value in d.items():
        prefixed[prefix + key] = value
    return prefixed


def df_from_cv_results(d: Dict):
    """Summarise a ``cv_results_``-style dict as a frame of scores and parameters.

    Keeps ``mean_test_score`` (required), ``mean_train_score`` (only when the
    search recorded it, i.e. was run with ``return_train_score=True``) and all
    ``param_*`` columns. Scores are negated so that a negated-error scorer such
    as ``rmse_score`` reads as a positive error, and rows are sorted ascending
    by ``mean_test_score`` (best first).

    Raises:
        KeyError: if ``mean_test_score`` is missing from ``d``.
    """
    df = pd.DataFrame(d)
    # Train scores are optional in cv_results_; do not fail when they are absent.
    score_columns = [
        c for c in ('mean_test_score', 'mean_train_score') if c in df.columns
    ]
    param_columns = [c for c in df.columns if c.startswith('param_')]
    return pd.concat([
        -df.loc[:, score_columns],
        df.loc[:, param_columns],
    ], axis=1).sort_values(by='mean_test_score')


def sample(*args, frac: float = 0.01) -> np.ndarray:
    """Draw the same random subset of rows from every positional argument.

    ``int(n_rows * frac)`` distinct row positions are chosen (without
    replacement) from the first argument's length and applied to each argument
    via ``arg[positions]``.

    Returns the sampled object directly when one argument is given; otherwise
    a generator yielding one sampled object per argument, in order.
    """
    total = args[0].shape[0]
    picked = np.random.choice(total, int(total * frac), replace=False)
    if len(args) == 1:
        return args[0][picked]
    return (arr[picked] for arr in args)

    
class BaseTransformer(BaseEstimator, TransformerMixin):
    """No-op transformer: ``fit`` returns ``self`` and ``transform`` returns its input.

    Serves as the common base for the custom transformers in this notebook,
    which override ``transform`` (and sometimes ``fit`` / ``fit_transform``).
    """
    
    def fit(self, x: pd.DataFrame, y = None):
        """Learn nothing and return ``self``."""
        return self
    
    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return ``x`` unchanged (the same object, not a copy)."""
        return x


class ColumnTransformer(BaseTransformer):
    """Apply one transformer per named column and write the results into a copy.

    ``defs`` maps a column name to the transformer applied to that column.
    Columns not listed in ``defs`` pass through untouched; the input frame is
    never modified.
    """

    def __init__(self, defs: Dict[str, BaseTransformer]):
        self.defs = defs

    def fit(self, x: pd.DataFrame, y: np.ndarray = None):
        """Fit every transformer on its own column (with ``y``); return ``self``."""
        for name in self.defs:
            self.defs[name].fit(x[name], y)
        return self

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``x`` whose configured columns are replaced by their transformed values."""
        result = x.copy()
        for name, step in self.defs.items():
            result[name] = step.transform(x[name])
        return result

    def fit_transform(self, x: pd.DataFrame, y: np.ndarray = None) -> pd.DataFrame:
        """Fit and transform in one pass.

        A transformer's own ``fit_transform`` is preferred when it has one;
        otherwise ``fit`` followed by ``transform`` is used.
        """
        result = x.copy()
        for name, step in self.defs.items():
            column = x[name]
            if hasattr(step, 'fit_transform'):
                encoded = step.fit_transform(column, y)
            else:
                encoded = step.fit(column, y).transform(column)
            result[name] = encoded
        return result


class WrappedLabelEncoder(BaseTransformer):
    """Adapter exposing a ``LabelEncoder`` through the ``fit(x, y)`` / ``transform(x)`` signature used by this notebook's transformers."""
    
    def __init__(self):
        # The wrapped encoder; it holds the learned label mapping after fit().
        self.le = LabelEncoder()
    
    def fit(self, x, y = None):
        """Fit the wrapped encoder on ``x``; ``y`` is accepted but ignored."""
        self.le.fit(x)
        return self

    def transform(self, x):
        """Return ``x`` encoded by the wrapped, already fitted encoder."""
        return self.le.transform(x)


def wind_chill_safely(t, w):
    """Return ``wind_chill(t, w)``, or a fixed 10 degC ``Temp`` when it raises ``ValueError``.

    NOTE(review): the ValueError is presumably raised for inputs outside the
    wind-chill formula's valid range - confirm against meteocalc. The 10 degC
    fallback is a constant placeholder, not derived from the inputs.
    """
    try:
        chill = wind_chill(t, w)
    except ValueError:
        chill = Temp(10, unit='C')
    return chill

# Load CSV

In [142]:
# Training rows: timestamps parsed, then numeric columns downcast to save memory.
train = pd.read_csv('data/train.csv', parse_dates=['timestamp']).pipe(reduce_mem_usage)
# Building metadata and training-period weather are loaded as-is (no downcasting).
building_metadata = pd.read_csv('data/building_metadata.csv')
weather_train = pd.read_csv('data/weather_train.csv', parse_dates=['timestamp'])

# Test rows (also downcast) and the weather file that accompanies them.
test = pd.read_csv('data/test.csv', parse_dates=['timestamp']).pipe(reduce_mem_usage)
weather_test = pd.read_csv('data/weather_test.csv', parse_dates=['timestamp'])

Mem. usage decreased to 289.19 Mb (53.1% reduction)
Mem. usage decreased to 596.49 Mb (53.1% reduction)


# Weather Transformer

In [93]:
class WeatherImputer(BaseTransformer):
    """Give every site a row for every hour and fill missing weather values.

    Missing cells are filled with the mean of the same
    (site_id, month, 3-hour time period) group.
    """
    
    def transform(self, w: pd.DataFrame) -> pd.DataFrame:
        """Return a new frame with hourly rows per site and gaps filled.

        Expects ``w`` to contain ``site_id`` and ``timestamp`` columns; the
        remaining columns are the values being imputed.
        """
        
        # Add missing hours: build one hourly grid spanning the whole input and
        # outer-merge it into each site's rows, so absent hours appear as rows
        # of NaN. Rows end up sorted by timestamp within each site.
        dt_min, dt_max = w['timestamp'].min(), w['timestamp'].max()
        empty_df = pd.DataFrame({'timestamp': pd.date_range(start=dt_min, end=dt_max, freq='H')})
        w_out = pd.concat([
            ws.merge(
                empty_df, on='timestamp', how='outer'
            ).sort_values(
                by='timestamp'
            ).assign(
                site_id=site_id
            ) for site_id, ws in w.groupby('site_id')
        ], ignore_index=True)
        
        # Grouping keys: calendar month and a 3-hour bucket of the day (0-7).
        w_out['month'] = w_out['timestamp'].dt.month
        w_out['time_period'] = pd.cut(
            w_out['timestamp'].dt.hour,
            bins=[0, 3, 6, 9, 12, 15, 18, 21, 25],
            right=False, labels=False,
        )
        w_out = w_out.set_index(['site_id', 'month', 'time_period'])
        # Per-group means. The bfill/ffill runs over the aggregated table as a
        # whole (not per site), so a group with no data at all takes the value
        # of an adjacent group in index order.
        # NOTE(review): fillna(method=...) is deprecated in recent pandas - confirm the target version.
        w_updater = w_out.groupby(['site_id', 'month', 'time_period']).mean().fillna(method='bfill').fillna(method='ffill')
        w_out.update(w_updater, overwrite=False)  # in-place; overwrite=False fills only cells that are NaN
        w_out = w_out.reset_index().drop(columns=['month', 'time_period'])

        # cast site_id to uint8 (original note: "float -> uint")
        w_out['site_id'] = w_out['site_id'].astype(np.uint8)

        return w_out


class WeatherEngineerer(BaseTransformer):
    """Derive weather features: shifted weighted averages, differences and comfort indices.

    For each of the window settings 1, -1, 5 and -5 hours it adds weighted
    averages of three base columns, then differences against those averages,
    temperature/dew-point spreads, relative humidity and the meteocalc
    feels-like / heat-index / wind-chill values.
    """
    
    @staticmethod
    def shift_by(wdf: pd.DataFrame, n: int) -> pd.DataFrame:
        """Shift three base columns by ``n`` rows within each site.

        The rows shifted in at the edge of each site are filled from the
        nearest valid row (bfill for n > 0, ffill otherwise).

        NOTE(review): columns are picked by POSITION (2, 4, 8). The rest of the
        class assumes they are air_temperature, dew_temperature and wind_speed
        (it reads e.g. 'air_temperature_wa1'), so the input column order matters.
        """
        method = 'bfill' if n > 0 else 'ffill'
        return pd.concat([
            ws.iloc[:, [2, 4, 8]].shift(n).fillna(method=method) for _, ws in wdf.groupby('site_id')
        ], axis=0)
    
    def weather_weighted_average(self, w: pd.DataFrame, hours: int = 5) -> pd.DataFrame:
        """Append ``<col>_wa<hours>`` columns: a weighted average over ``|hours|`` shifted copies.

        The copy shifted by one row gets the largest weight (``|hours|``) and
        the weight decreases by one per additional row of shift. The sign of
        ``hours`` selects the shift direction. ``hours`` must be non-zero
        (zero would divide by zero when computing the sign).
        """
        ahours = abs(hours)
        sign = int(hours / ahours)
        # Weights ahours, ahours-1, ..., 1 normalised by their sum 1 + 2 + ... + ahours.
        w_weighted_average = sum(
            [self.shift_by(w, (i+1)*sign) * (ahours-i) for i in range(ahours)]
        ) / (np.arange(ahours) + 1).sum()

        w_weighted_average.columns = ['{0}_wa{1}'.format(c, hours) for c in w_weighted_average.columns]

        # Joined on the index: shift_by keeps the original row labels.
        return pd.concat([w, w_weighted_average], axis=1)
    
    @staticmethod
    def dwdt(df: pd.DataFrame, base_col: str) -> pd.DataFrame:
        """Add ``<base_col>_dt_wa*``: the current value minus each of its four weighted averages."""
        df_out = df.copy()
        df_out[base_col + '_dt_wa1'] = df[base_col] - df[base_col + '_wa1']
        df_out[base_col + '_dt_wa-1'] = df[base_col] - df[base_col + '_wa-1']
        df_out[base_col + '_dt_wa5'] = df[base_col] - df[base_col + '_wa5']
        df_out[base_col + '_dt_wa-5'] = df[base_col] - df[base_col + '_wa-5']
        return df_out
    
    @staticmethod
    def wet(df: pd.DataFrame, suffix: str = '') -> pd.DataFrame:
        """Add ``wet<suffix>``: air temperature minus dew temperature for the given column suffix."""
        df_out = df.copy()
        df_out['wet' + suffix] = df['air_temperature' + suffix] - df['dew_temperature' + suffix]
        return df_out
        
    @staticmethod
    def meteocalc(w_in: pd.DataFrame, suffix: str = '') -> pd.DataFrame:
        """Add relative humidity, feels-like, heat index and wind chill for the given column suffix.

        Relative humidity (in percent) is computed from dew and air
        temperature; the other three come from meteocalc, evaluated row by row
        with temperatures passed as Celsius.

        NOTE(review): the 17.625 / 243.04 constants look like the Magnus
        approximation for vapour pressure - confirm the intended formula.
        """

        w = w_in.assign(**{
            'relative_humidity' + suffix: 100 * (
                np.exp(
                    (17.625 * w_in['dew_temperature' + suffix]) / (243.04 + w_in['dew_temperature' + suffix])
                ) / np.exp(
                    (17.625 * w_in['air_temperature' + suffix]) / (243.04 + w_in['air_temperature' + suffix])
                )
            )
        })
        # Row-wise apply: one Python call per row for each of the three indices.
        return w.assign(**{
            'feels_like' + suffix: w.apply(lambda row: feels_like(
                Temp(row['air_temperature' + suffix], unit='C'),
                row['relative_humidity' + suffix],
                row['wind_speed' + suffix]
            ).c, axis=1)
        }).assign(**{
            'heat_index' + suffix: w.apply(lambda row: heat_index(
                Temp(row['air_temperature' + suffix], unit='C'),
                row['relative_humidity' + suffix]
            ).c, axis=1)
        }).assign(**{
            # wind_chill_safely substitutes a constant when wind_chill raises ValueError.
            'wind_chill' + suffix: w.apply(lambda row: wind_chill_safely(
                Temp(row['air_temperature' + suffix], unit='C'),
                row['wind_speed' + suffix]
            ).c, axis=1)
        })
    
    def transform(self, w_in: pd.DataFrame) -> pd.DataFrame:
        """Run all feature steps in order and return the widened frame.

        The weighted averages must come first because every later step reads
        the ``*_wa*`` columns they create.
        """
        # Step 1: weighted averages for windows 1, -1, 5 (the default) and -5.
        w = w_in.pipe(
            self.weather_weighted_average, hours=1
        ).pipe(
            self.weather_weighted_average, hours=-1
        ).pipe(
            self.weather_weighted_average
        ).pipe(
            self.weather_weighted_average, hours=-5
        )

        # Step 2: differences, spreads and comfort indices for the raw columns
        # and for each averaged variant.
        w = w.pipe(
            self.dwdt, base_col='air_temperature'
        ).pipe(
            self.dwdt, base_col='dew_temperature'
        ).pipe(
            self.dwdt, base_col='wind_speed'
        ).pipe(
            self.wet
        ).pipe(
            self.wet, suffix='_wa1'
        ).pipe(
            self.wet, suffix='_wa-1'
        ).pipe(
            self.wet, suffix='_wa5'
        ).pipe(
            self.wet, suffix='_wa-5'
        ).pipe(
            self.meteocalc
        ).pipe(
            self.meteocalc, suffix='_wa1'
        ).pipe(
            self.meteocalc, suffix='_wa-1'
        ).pipe(
            self.meteocalc, suffix='_wa5'
        ).pipe(
            self.meteocalc, suffix='_wa-5'
        )

        return w



class WindDirectionEncoder(BaseTransformer):
    """Encode a wind direction in degrees as one of 16 sectors numbered 0-15."""

    @staticmethod
    def _from_degree(degree: int) -> int:
        """Map ``degree`` to its sector index; each sector spans 22.5 degrees."""
        # Adding 0.5 before truncating centres sector 0 on 0 degrees; the
        # modulo wraps values near 360 back to sector 0.
        sector = int(degree / 22.5 + 0.5)
        return sector % 16

    def transform(self, x: pd.Series) -> pd.Series:
        """Apply the sector mapping to every element of ``x``."""
        return x.apply(self._from_degree)


class WindSpeedEncoder(BaseTransformer):
    """Bin wind speed into 13 integer classes (0-12).

    NOTE(review): the bin edges look like Beaufort-scale boundaries in m/s -
    confirm the unit of the input column.
    """

    def transform(self, x: pd.Series) -> pd.Series:
        """Return the bin index of each value; bins are closed on the left."""
        edges = [0, 0.3, 1.6, 3.4, 5.5, 8, 10.8, 13.9, 17.2, 20.8, 24.5, 28.5, 33, 1000]
        return pd.cut(x, bins=edges, right=False, labels=False)

    
# Weather preprocessing: fill gaps, add engineered features, then replace the
# raw wind columns (including every weighted-average variant of wind speed)
# with integer class codes.
weather_pipeline = Pipeline(steps=[
    ('impute_missing_value', WeatherImputer()),
    ('feature_engineering', WeatherEngineerer()),
    # Runs last, so the engineered features above are computed from raw wind speeds.
    ('label_encode', ColumnTransformer({
        'wind_direction': WindDirectionEncoder(),
        'wind_speed': WindSpeedEncoder(),
        'wind_speed_wa1': WindSpeedEncoder(),
        'wind_speed_wa-1': WindSpeedEncoder(),    
        'wind_speed_wa5': WindSpeedEncoder(),
        'wind_speed_wa-5': WindSpeedEncoder(),    
    }))
])

# Building Metadata Transformer

In [109]:
class BuildingMetadataEngineerer(BaseTransformer):
    """Add size and age features derived from the building metadata columns."""

    def transform(self, bm_in: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``bm_in`` with six derived columns appended.

        Reads ``square_feet``, ``floor_count`` and ``year_built``. Building age
        is measured relative to the fixed year 2019. Missing inputs propagate
        as NaN into the derived columns.
        """
        area = bm_in['square_feet']
        floors = bm_in['floor_count']
        log_area = np.log1p(area)
        age = 2019 - bm_in['year_built']
        return bm_in.assign(
            log_square_feet=log_area,
            square_feet_per_floor=area / floors,
            log_square_feet_per_floor=log_area / floors,
            building_age=age,
            square_feet_per_age=area / age,
            log_square_feet_per_age=log_area / age,
        )


class BuildingMetadataImputer(BaseTransformer):
    """Replace every missing value in the building metadata with the sentinel -999."""
    
    def transform(self, bm: pd.DataFrame) -> pd.DataFrame:
        """Return ``bm`` with all NaN cells, in every column, set to -999."""
        return bm.fillna(-999)


# Features are engineered BEFORE imputation, so a derived feature computed from
# a missing input is NaN first and then becomes the -999 sentinel.
building_metadata_pipeline = Pipeline(steps=[
    ('feature_engineering', BuildingMetadataEngineerer()),
    ('impute_missing_value', BuildingMetadataImputer()),
])

# Transformer

In [110]:
class BuildingMetaJoiner(BaseTransformer):
    """Left-join building metadata onto the incoming rows by ``building_id``."""

    def __init__(self, bm: pd.DataFrame = None):
        self.bm = bm

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return ``x`` merged with the metadata, or ``x`` itself when none was given."""
        if self.bm is None:
            return x
        return x.merge(self.bm, on='building_id', how='left')

    
class WeatherJoiner(BaseTransformer):
    """Left-join weather rows onto the incoming rows by ``site_id`` and ``timestamp``."""

    def __init__(self, w: pd.DataFrame = None):
        self.w = w

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return ``x`` merged with the weather frame, or ``x`` itself when none was given."""
        if self.w is None:
            return x
        return x.merge(self.w, on=['site_id', 'timestamp'], how='left')


class DatetimeFeatureEngineerer(BaseTransformer):
    """Add calendar features derived from a datetime column.

    Adds ``month``, ``week`` (ISO week number), ``day_of_week`` (Monday = 0),
    ``time_period`` (3-hour bucket of the day, 0-7) and ``is_holiday`` (1 when
    the date is in the hard-coded 2016-2019 holiday list).
    """

    def __init__(self, col: str = 'timestamp'):
        # col: name of the datetime column the features are derived from
        self.col = col

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Return a copy of ``x`` with the calendar feature columns appended."""
        xp = x.copy()
        ts = x[self.col]
        xp['month'] = ts.dt.month.astype(np.int8)
        # ``Series.dt.week`` was deprecated and later removed from pandas;
        # ``isocalendar().week`` gives the same ISO week number. Fall back to
        # the old accessor on pandas versions that predate ``isocalendar``.
        if hasattr(ts.dt, 'isocalendar'):
            week = ts.dt.isocalendar().week
        else:
            week = ts.dt.week
        xp['week'] = week.astype(np.int8)
        xp['day_of_week'] = ts.dt.weekday.astype(np.int8)
        xp['time_period'] = pd.cut(
            ts.dt.hour,
            bins=[0, 3, 6, 9, 12, 15, 18, 21, 25],
            right=False, labels=False,
        )

        # Dates flagged as holidays, as ISO date strings.
        holidays = [
            '2016-01-01', '2016-01-18', '2016-02-15', '2016-05-30', '2016-07-04',
            '2016-09-05', '2016-10-10', '2016-11-11', '2016-11-24', '2016-12-26',
            '2017-01-01', '2017-01-16', '2017-02-20', '2017-05-29', '2017-07-04',
            '2017-09-04', '2017-10-09', '2017-11-10', '2017-11-23', '2017-12-25',
            '2018-01-01', '2018-01-15', '2018-02-19', '2018-05-28', '2018-07-04',
            '2018-09-03', '2018-10-08', '2018-11-12', '2018-11-22', '2018-12-25',
            '2019-01-01'
        ]
        xp['is_holiday'] = (ts.dt.date.astype('str').isin(holidays)).astype(np.int8)
        return xp


class TargetEncoder(BaseTransformer):
    """Mean-target encoder with out-of-fold encoding for the training data.

    ``fit`` / ``fit_transform`` learn the mean of ``y`` per category and store
    it in ``self.agg`` (a frame indexed by category with one column ``y``).
    ``transform`` maps categories to those means. ``fit_transform`` returns
    out-of-fold encodings instead: each row is encoded with means computed on
    the other folds, so a row's own target does not leak into its encoding.

    Categories without a learned mean are filled with the mean of the
    encodings produced in the same call (or fold). The result is always a
    float64 Series carrying the input's index and name.

    Args:
        cv: number of KFold splits used by ``fit_transform``.
        smoothing: stored for configuration; not applied by the current
            implementation.
    """

    def __init__(self, cv: int = 5, smoothing: int = 1):
        self.agg = None
        self.cv = cv
        # Previously hard-coded to 1, which silently discarded the argument.
        self.smoothing = smoothing

    @staticmethod
    def _frame(x, y) -> pd.DataFrame:
        """Pair categories and targets by position, ignoring any index on either."""
        if y is None:
            raise ValueError('TargetEncoder needs the target y to be fitted')
        return pd.DataFrame({'x': np.asarray(x), 'y': np.asarray(y)})

    @staticmethod
    def _encode(keys: pd.Series, means: pd.Series) -> pd.Series:
        """Map categories to means; unknown categories get the mean of the mapped values."""
        encoded = keys.map(means)
        return encoded.fillna(encoded.mean())

    def fit(self, x: pd.Series, y: np.ndarray = None):
        """Learn the per-category target mean from all rows; return ``self``."""
        self.agg = self._frame(x, y).groupby('x').mean()
        return self

    def transform(self, x: pd.Series):
        """Encode ``x`` with the means learned during fitting.

        Raises:
            ValueError: if the encoder has not been fitted.
        """
        if self.agg is None:
            raise ValueError('you should fit() before transform()')
        keys = pd.Series(np.asarray(x), index=x.index)
        xp = self._encode(keys, self.agg['y'])
        xp.name = x.name
        return xp

    def fit_transform(self, x: pd.Series, y: np.ndarray = None) -> pd.Series:
        """Fit on all rows and return out-of-fold encodings of ``x``."""
        self.fit(x, y)
        df = self._frame(x, y)
        encoded_all = np.empty(len(df), dtype=float)
        fold = KFold(n_splits=self.cv, shuffle=True)
        for idx_train, idx_test in fold.split(df):
            # KFold yields positions, so select with iloc; label-based .loc is
            # only correct when the index happens to be 0..n-1.
            fold_means = df.iloc[idx_train].groupby('x')['y'].mean()
            encoded = self._encode(df['x'].iloc[idx_test], fold_means)
            encoded_all[idx_test] = encoded.values.astype(float)
        return pd.Series(encoded_all, index=x.index, name=x.name)


class ColumnDropper(BaseTransformer):
    """Remove a fixed list of columns from the incoming DataFrame."""
    
    def __init__(self, cols: List[str]):
        # cols: names of the columns removed by transform()
        self.cols = cols
    
    def transform(self, x: pd.DataFrame, y = None) -> pd.DataFrame:
        """Return ``x`` without the configured columns; ``y`` is accepted but unused."""
        return x.drop(columns=self.cols)


class ArrayTransformer(BaseTransformer):
    """Convert a DataFrame to its underlying NumPy array."""
    
    def transform(self, x: pd.DataFrame, y = None) -> np.ndarray:
        """Return ``x.values``; ``y`` is accepted but unused."""
        return x.values

# Pipeline

In [132]:
# Engineer and impute the building table once, up front.
building_metadata_pipeline_out = building_metadata_pipeline.fit_transform(
    building_metadata
)
# Train and test weather are stacked before the pipeline runs, so gap filling
# and the shifted weighted averages are computed over the combined rows.
weather_pipeline_out = weather_pipeline.fit_transform(
    pd.concat([weather_train, weather_test], axis=0, ignore_index=True)
)


def pipeline_factory() -> Pipeline:
    """Build a fresh feature pipeline: join metadata and weather, add datetime features, drop id columns.

    The joins use the module-level ``building_metadata_pipeline_out`` and
    ``weather_pipeline_out`` frames, which must exist before this is called.

    NOTE(review): every ``regressor`` step below is commented out, so the
    returned pipeline has no step named 'regressor' and ends with a
    transformer. Callers that set ``regressor__*`` parameters or call
    ``predict`` need one of those lines re-enabled.
    """
    return Pipeline(steps=[

        # join
        ('join_building_meta', BuildingMetaJoiner(
            building_metadata_pipeline_out
        )),
        ('join_weather', WeatherJoiner(
            weather_pipeline_out
        )),

        # feature engineering
        ('feature_engineering_from_datetime', DatetimeFeatureEngineerer()),
#         ('target_encode', ColumnTransformer({
#             'primary_use': TargetEncoder(),
#             'meter': TargetEncoder(),
#             'cloud_coverage': TargetEncoder(),
#             'time_period': TargetEncoder(),
#             'wind_direction': TargetEncoder(),
#             'wind_speed': TargetEncoder(),
#             'wind_speed_wa1': TargetEncoder(),
#             'wind_speed_wa-1': TargetEncoder(),
#             'wind_speed_wa5': TargetEncoder(),
#             'wind_speed_wa-5': TargetEncoder(),
#         })),

        # drop columns (join keys and the raw precipitation column)
        ('drop_columns', ColumnDropper([
            'building_id', 'timestamp', 'site_id', 'precip_depth_1_hr',
        ])),

        # pd.DataFrame -> np.ndarray
#         ('df_to_array', ArrayTransformer()),

        # regressor
#         ('regressor', RandomForestRegressor()),
#         ('regressor', CatBoostRegressor()),

    ])

In [134]:
# Smoke test: run the feature pipeline on a 0.1% random sample of train
# (target column removed) and inspect the resulting feature columns.
tmp = pipeline_factory().fit_transform(
    train.sample(frac=0.001).drop(columns='meter_reading')
)
tmp.head()

Unnamed: 0,meter,primary_use,square_feet,year_built,floor_count,log_square_feet,square_feet_per_floor,log_square_feet_per_floor,building_age,square_feet_per_age,log_square_feet_per_age,air_temperature,cloud_coverage,dew_temperature,sea_level_pressure,wind_direction,wind_speed,air_temperature_wa1,dew_temperature_wa1,wind_speed_wa1,air_temperature_wa-1,dew_temperature_wa-1,wind_speed_wa-1,air_temperature_wa5,dew_temperature_wa5,wind_speed_wa5,air_temperature_wa-5,dew_temperature_wa-5,wind_speed_wa-5,air_temperature_dt_wa1,air_temperature_dt_wa-1,air_temperature_dt_wa5,air_temperature_dt_wa-5,dew_temperature_dt_wa1,dew_temperature_dt_wa-1,dew_temperature_dt_wa5,dew_temperature_dt_wa-5,wind_speed_dt_wa1,wind_speed_dt_wa-1,wind_speed_dt_wa5,wind_speed_dt_wa-5,wet,wet_wa1,wet_wa-1,wet_wa5,wet_wa-5,relative_humidity,feels_like,heat_index,wind_chill,relative_humidity_wa1,feels_like_wa1,heat_index_wa1,wind_chill_wa1,relative_humidity_wa-1,feels_like_wa-1,heat_index_wa-1,wind_chill_wa-1,relative_humidity_wa5,feels_like_wa5,heat_index_wa5,wind_chill_wa5,relative_humidity_wa-5,feels_like_wa-5,heat_index_wa-5,wind_chill_wa-5,month,week,day_of_week,time_period,is_holiday
0,1,Education,198488,-999.0,-999.0,12.198489,-999.0,-999.0,-999.0,-999.0,-999.0,20.0,0.0,12.8,1016.1,7,3,21.1,12.2,3,18.3,12.8,2,22.3,11.586667,2,17.626667,13.373333,2,-1.1,1.7,-2.3,2.373333,0.6,0.0,1.213333,-0.573333,0.5,1.5,1.6,2.166667,7.2,8.9,5.5,10.713333,4.253333,63.235822,20.0,19.706713,10.0,56.808982,21.1,20.748901,10.0,70.302206,18.3,18.021224,10.0,50.700087,22.3,21.909391,10.0,76.140477,17.626667,17.433001,10.0,11,47,1,0,0
1,0,Education,214505,1990.0,-999.0,12.276093,-999.0,-999.0,29.0,7396.724138,0.423314,15.6,0.0,13.3,1025.8,4,2,13.3,12.2,2,20.0,15.0,2,13.42,12.2,2,23.04,16.006667,2,2.3,-4.4,2.18,-7.44,1.1,-1.7,1.1,-2.706667,0.0,0.0,-0.166667,-0.433333,2.3,1.1,5.0,1.22,7.033333,86.191268,15.6,15.466105,10.0,93.051517,13.3,13.115234,10.0,72.938767,20.0,19.960068,10.0,92.326782,13.42,13.22831,10.0,64.590876,23.04,23.086095,10.0,8,34,2,4,0
2,0,Education,47200,1935.0,-999.0,10.76217,-999.0,-999.0,84.0,561.904762,0.128121,3.9,4.0,-3.9,1015.2,9,3,5.0,-4.4,3,5.0,-4.4,2,4.646667,-4.213333,3,3.72,-3.486667,2,-1.1,-1.1,-0.746667,0.18,0.5,0.5,0.313333,-0.413333,0.5,1.5,-0.9,1.266667,7.8,9.4,9.4,8.86,7.206667,56.790943,2.273525,1.82843,2.273525,50.648938,3.807809,2.878056,3.807809,50.648938,5.0,2.878056,10.0,52.645658,2.722191,2.541526,2.722191,59.319445,3.72,1.696452,10.0,1,4,6,2,0
3,2,Education,258491,-999.0,-999.0,12.46262,-999.0,-999.0,-999.0,-999.0,-999.0,11.1,0.522388,8.9,1017.8,5,2,11.1,8.3,2,11.1,9.4,2,13.146667,10.513333,2,10.766667,9.306667,2,0.0,0.0,-2.046667,0.333333,0.6,-0.5,-1.613333,-0.406667,0.5,0.5,0.436039,0.433333,2.2,2.8,1.7,2.633333,1.46,86.313549,11.1,10.519298,10.0,82.880385,11.1,10.429655,10.0,89.269749,11.1,10.596488,10.0,84.058699,13.146667,12.711755,10.0,90.696896,10.766667,10.267086,10.0,12,48,5,1,0
4,0,Education,51020,1924.0,5.0,10.839993,10204.0,2.167999,95.0,537.052632,0.114105,13.3,2.0,10.0,1015.8,13,3,13.9,10.0,3,13.3,10.0,2,15.013333,10.5,3,13.1,9.96,2,-0.6,0.0,-1.713333,0.2,0.0,0.0,-0.5,0.04,0.0,0.5,-0.506667,1.126667,3.3,3.9,3.3,4.513333,3.14,80.418244,13.3,12.785365,10.0,77.340598,13.9,13.365005,10.0,80.418244,13.3,12.785365,10.0,74.416405,15.013333,14.513317,10.0,81.256645,13.1,12.587257,10.0,6,26,0,3,0


In [135]:
# Feature columns that cv_catboost passes to CatBoost as categorical features.
cat_features = [
    'meter', 'primary_use', 'wind_direction', 'wind_speed',
    'wind_speed_wa1', 'wind_speed_wa-1', 'wind_speed_wa5', 'wind_speed_wa-5',
    'day_of_week', 'time_period', 'is_holiday',
]

# CatBoost CV

In [138]:
def cv_catboost(p: Pipeline, df: pd.DataFrame, **params):
    """Train one CatBoost model per fold of a 3-fold CV and log the run to mlflow.

    The target is ``log1p(meter_reading)``; features come from
    ``p.fit_transform`` applied to ``df`` without the target column. Each fold
    model is fitted with its validation fold as ``eval_set``.

    Logged to mlflow: ``params``, the mean best train/validation RMSE across
    folds, the per-iteration RMSE curve of the first fold
    (``out/eval_result.csv``) and every fold model (``out/model<i>.joblib``).

    Args:
        p: feature pipeline that ends in a transformer returning a DataFrame.
        df: training rows including ``meter_reading``.
        **params: keyword arguments forwarded to ``CatBoostRegressor``.

    Returns:
        The list of fitted fold models.
    """
    import os

    x = p.fit_transform(df.drop(columns='meter_reading'))
    y = np.log1p(df['meter_reading'].values)

    # Create the artifact directory before training so that a missing folder
    # cannot discard the fitted models at the very end.
    os.makedirs('out', exist_ok=True)

    models = []

    mlflow.set_experiment(CURRENT_EXPERIMENT_NAME)
    with mlflow.start_run():

        mlflow.log_params(params)

        for i_train, i_val in KFold(n_splits=3, shuffle=True).split(x):

            # KFold yields row positions, so index by position (iloc); .loc
            # would only be correct when the index is exactly 0..n-1.
            x_train, x_val = x.iloc[i_train, :], x.iloc[i_val, :]
            y_train, y_val = y[i_train], y[i_val]

            model = CatBoostRegressor(**params)

            model.fit(
                x_train, y_train,
                cat_features=cat_features,
                eval_set=(x_val, y_val),
                logging_level='Verbose',
            )

            models.append(model)

        mlflow.log_metrics(dict(
            rmse_train=np.mean([m.best_score_['learn']['RMSE'] for m in models]),
            rmse_val=np.mean([m.best_score_['validation']['RMSE'] for m in models]),
        ))
        # Only the first fold's learning curve is exported.
        eval_result = pd.DataFrame({
            'RMSE_train': models[0].evals_result_['learn']['RMSE'],
            'RMSE_eval': models[0].evals_result_['validation']['RMSE']
        })
        eval_result.to_csv('out/eval_result.csv', index=False)
        mlflow.log_artifact('out/eval_result.csv')
        for i, m in enumerate(models):
            joblib.dump(m, 'out/model{0}.joblib'.format(i))
            mlflow.log_artifact('out/model{0}.joblib'.format(i))
        return models

In [None]:
# Quick end-to-end check: 3-fold CatBoost CV on a 0.1% sample of train with a
# deliberately small model (10 trees).
cv_catboost(
    pipeline_factory(),
    train.sample(frac=0.001),
    n_estimators=10,
    max_depth=10,
    learning_rate=0.1,
    early_stopping_rounds=10,
)

# CatBoost Prediction

In [147]:
def load_model_catboost(run_id: str = None, n_models: int = 3):
    """Load the fold models written by ``cv_catboost``.

    Args:
        run_id: mlflow run whose ``model<i>.joblib`` artifacts are downloaded.
            When ``None`` the files are read from the local ``out/`` directory.
        n_models: number of fold models to load (``model0`` .. ``model<n-1>``).
            Defaults to 3, the fold count used by ``cv_catboost``.

    Returns:
        List of loaded models, in fold order.
    """
    file_names = ['model{0}.joblib'.format(i) for i in range(n_models)]
    if run_id is None:
        model_paths = ['out/' + name for name in file_names]
    else:
        c = mlflow.tracking.MlflowClient()
        model_paths = [c.download_artifacts(run_id, name) for name in file_names]

    # joblib.load unpickles the file: only load artifacts from trusted runs.
    return [joblib.load(p) for p in model_paths]


def predict_catboost(df: pd.DataFrame, p: Pipeline, models) -> pd.DataFrame:
    """Average the fold models' predictions into a submission frame.

    The first column of ``df`` is taken as the row id; the remaining columns
    are transformed by ``p``. Each model predicts ``log1p(meter_reading)``, so
    predictions are mapped back with ``expm1``, averaged across models and
    clipped at zero.

    Args:
        df: test rows with the row id in the first column.
        p: feature pipeline exposing ``transform``.
        models: iterable of fitted models exposing ``predict``.

    Returns:
        DataFrame with columns ``row_id`` and ``meter_reading``.
    """
    # Transform once and reuse for every model: the features do not depend on
    # the model, and re-running the joins per model multiplies the cost.
    features = p.transform(df.iloc[:, 1:])
    y = np.mean([
        np.expm1(m.predict(features)) for m in models
    ], axis=0)
    y = np.clip(y, a_min=0, a_max=None)
    return pd.DataFrame({
        'row_id': df.iloc[:, 0],
        'meter_reading': y,
    })[['row_id', 'meter_reading']]

In [None]:
# Load the fold models from out/ and write the averaged predictions for the test set.
ms = load_model_catboost()
predict_catboost(test, pipeline_factory(), ms).to_csv('submission.csv', index=False)

# Random Forest CV

In [None]:
def cv(pipeline: Pipeline, df: pd.DataFrame, n_jobs: int = -1, **params) -> Tuple[float, float]:
    """Run 3-fold cross-validation of ``pipeline`` and log the result to mlflow.

    The target is ``log1p(meter_reading)``. ``params`` override the default
    regressor settings and are applied to the pipeline step named
    ``regressor``, together with ``n_jobs``.

    Returns:
        ``(rmse_val, rmse_train)``: mean validation and mean training RMSE
        across the folds, as positive numbers.
    """
    features = df.drop(columns='meter_reading')
    target = np.log1p(df['meter_reading'].values)

    # Defaults first, caller overrides second.
    merged_params = dict(
        n_estimators=10,
        max_depth=None,
        max_features='auto',
        min_samples_leaf=1,
    )
    merged_params.update(params)

    # n_jobs is applied to the regressor but kept out of the logged parameters.
    regressor_params = dict(merged_params, n_jobs=n_jobs)
    pipeline.set_params(**add_key_prefix(regressor_params, 'regressor__'))

    mlflow.set_experiment(CURRENT_EXPERIMENT_NAME)
    with mlflow.start_run():
        mlflow.log_params(merged_params)
        scores = cross_validate(
            pipeline, features, target,
            cv=3,
            scoring=rmse_score,
            return_train_score=True,
            verbose=2,
        )

        # rmse_score reports negated RMSE; flip the sign back.
        rmse_val = - np.mean(scores['test_score'])
        rmse_train = - np.mean(scores['train_score'])
        mlflow.log_metrics({'rmse_val': rmse_val, 'rmse_train': rmse_train})
        return rmse_val, rmse_train

In [None]:
# Random-forest CV on the full training set.
# NOTE(review): pipeline_factory() currently has its 'regressor' step commented
# out, so the regressor__* parameters set inside cv() have no step to apply to.
cv(
    pipeline_factory(),
    train,
    n_jobs=-1,
    n_estimators=64,
    min_samples_leaf=0.00001,
)

# Random Forest One shot

In [14]:
def oneshot(pipeline: Pipeline, df: pd.DataFrame, **params):
    """Fit ``pipeline`` once on all of ``df``, save it and log the run to mlflow.

    The target is ``log1p(meter_reading)``. ``params`` override the default
    regressor settings and are applied to the pipeline step named
    ``regressor`` (always with ``n_jobs=-1`` and ``verbose=2``).

    Logged to mlflow: the merged parameters, the training RMSE and the fitted
    pipeline (``out/pipeline.joblib``).

    Returns:
        The fitted pipeline.
    """
    import os

    x = df.drop(columns='meter_reading')
    y = np.log1p(df['meter_reading'].values)

    default_params = dict(
        n_estimators=10,
        max_depth=None,
        max_features='auto',
        min_samples_leaf=1,
    )
    merged_params = {**default_params, **params}

    pipeline_params = {**merged_params, 'n_jobs': -1, 'verbose': 2}
    pipeline_params = add_key_prefix(pipeline_params, 'regressor__')
    pipeline.set_params(**pipeline_params)

    # Create the artifact directory before fitting so that a missing folder
    # cannot make the dump fail after the (long) fit has finished.
    os.makedirs('out', exist_ok=True)

    mlflow.set_experiment(CURRENT_EXPERIMENT_NAME)
    with mlflow.start_run():

        mlflow.log_params(merged_params)

        pipeline.fit(x, y)
        joblib.dump(pipeline, 'out/pipeline.joblib', compress=1)

        # Training-set error only; no held-out data is used here.
        score = rmse(y, pipeline.predict(x))

        mlflow.log_metrics(dict(rmse_train=score))
        mlflow.log_artifact('out/pipeline.joblib')

        return pipeline

In [None]:
# Fit a single random-forest pipeline on the full training set.
# NOTE(review): requires a 'regressor' step, which pipeline_factory() currently has commented out.
p = oneshot(pipeline_factory(), train, n_estimators=64, min_samples_leaf=0.00001)

# Random Forest Grid Search

In [None]:
def grid_search(pipeline: Pipeline, df: pd.DataFrame, n_jobs: int = -1, **param_grid):
    """Grid-search the pipeline's regressor with 3-fold CV and log the run to mlflow.

    The target is ``log1p(meter_reading)``. ``param_grid`` entries (lists of
    candidate values) override the default grid and are applied to the
    pipeline step named ``regressor``.

    Logged to mlflow: the grid, the best parameters (prefixed ``best_``), the
    best RMSE, the refitted best pipeline (``out/pipeline.joblib``) and the
    result table (``out/cv_results.csv``).

    Returns:
        The result table produced by ``df_from_cv_results``.
    """
    import os

    x = df.drop(columns='meter_reading')
    y = np.log1p(df['meter_reading'].values)

    default_param_grid = dict(
        n_estimators=[80],
        max_depth=[None],
        max_features=['auto'],
        min_samples_leaf=[0.00001],
    )
    merged_param_grid = {**default_param_grid, **param_grid}
    pipeline_param_grid = add_key_prefix(merged_param_grid, 'regressor__')

    pipeline.set_params(regressor__n_jobs=n_jobs)

    # Create the artifact directory before the search so that a missing folder
    # cannot make the final writes fail after all the fitting is done.
    os.makedirs('out', exist_ok=True)

    mlflow.set_experiment(CURRENT_EXPERIMENT_NAME)
    with mlflow.start_run():

        mlflow.log_params(merged_param_grid)

        regressor = GridSearchCV(
            pipeline,
            param_grid=pipeline_param_grid,
            cv=3,
            scoring=rmse_score,
            verbose=2,
            refit=True,
            # df_from_cv_results reads 'mean_train_score', which cv_results_
            # only contains when train scores are requested explicitly.
            return_train_score=True,
        )

        regressor.fit(x, y)

        best_model = regressor.best_estimator_
        best_param = add_key_prefix(regressor.best_params_)
        # rmse_score reports negated RMSE; flip the sign back.
        best_rmse = - regressor.best_score_
        cv_results = df_from_cv_results(regressor.cv_results_)

        joblib.dump(best_model, 'out/pipeline.joblib')
        cv_results.to_csv('out/cv_results.csv', index=False)

        mlflow.log_params(best_param)
        mlflow.log_metrics(dict(
            rmse=best_rmse,
        ))
        mlflow.log_artifact('./out/pipeline.joblib')
        mlflow.log_artifact('./out/cv_results.csv')
        # The run is closed by the ``with`` block; no explicit end_run() is needed.
        return cv_results

In [None]:
# Search over forest size and max_features on the full training set (6 combinations x 3 folds).
# NOTE(review): requires a 'regressor' step, which pipeline_factory() currently has commented out.
grid_search(
    pipeline_factory(),
    train,
    n_jobs=-1,
    n_estimators=[64, 80, 96],
    max_features=['auto', 'sqrt'],
)

# Random Forest Prediction

In [None]:
def load_model(run_id: str = None):
    """Load a saved pipeline.

    With a ``run_id`` the ``pipeline.joblib`` artifact of that mlflow run is
    downloaded and loaded; otherwise the local ``out/pipeline.joblib`` is read.
    """
    if run_id is not None:
        client = mlflow.tracking.MlflowClient()
        return joblib.load(client.download_artifacts(run_id, 'pipeline.joblib'))
    return joblib.load('out/pipeline.joblib')


def predict(df: pd.DataFrame, pipeline: Pipeline) -> pd.DataFrame:
    """Predict meter readings for ``df`` and return them as a submission frame.

    The first column of ``df`` is taken as the row id and every other column
    is passed to ``pipeline.predict``. Predictions are on the ``log1p`` scale
    and are mapped back with ``expm1``.

    Returns:
        DataFrame with columns ``row_id`` and ``meter_reading``.
    """
    row_ids = df.iloc[:, 0]
    features = df.iloc[:, 1:]
    readings = np.expm1(pipeline.predict(features))
    submission = pd.DataFrame({'row_id': row_ids, 'meter_reading': readings})
    return submission[['row_id', 'meter_reading']]

In [None]:
# Load the saved pipeline from out/ and write its predictions for the test set.
p = load_model()
predict(test, p).to_csv('submission.csv', index=False)

In [None]:
# Shell command: upload submission.csv to the competition through the Kaggle CLI.
!kaggle competitions submit -c ashrae-energy-prediction -f submission.csv -m "weighted average"

In [None]:
import seaborn as sns

In [None]:
# Look at the first row of the raw training weather to check its columns.
weather_train.head(1)

In [None]:
# Pairwise scatter plots of every weather column from the third one onward (the first two are skipped).
sns.pairplot(weather_train.iloc[:, 2:])

In [None]:
# Frequency of each distinct precip_depth_1_hr value in the training weather.
weather_train['precip_depth_1_hr'].value_counts()

In [None]:
# Fraction of missing values per column of the raw training weather.
weather_train.pipe(missing_rate)