In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import gc
from lightgbm import LGBMRegressor

Function to reduce memory usage https://www.kaggle.com/ihelon/simple-xgboost

See also https://www.kaggle.com/nroman/4-kfolds-lightgbm, in particular its `reduce_mem_usage_sd` function and its use of LightGBM.

In [2]:
# https://blog.csdn.net/appleyuchi/article/details/100251308
def memory_usage_mb(df, *args, **kwargs):
    """Return the total memory footprint of ``df`` in MB (1 MB = 1024**2 bytes).

    Any extra positional or keyword arguments are handed unchanged to
    ``df.memory_usage`` (for example ``deep=True`` or ``index=False``).
    """
    bytes_per_column = df.memory_usage(*args, **kwargs)
    return bytes_per_column.sum() / 1024**2
 

def reduce_memory_usage(df, deep=True, verbose=True, categories=True):
    """Downcast the columns of ``df`` to lighter dtypes, in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned in place.
    deep : bool, default True
        Forwarded to ``memory_usage_mb`` when measuring the footprint.
    verbose : bool, default True
        Print every conversion performed and the overall memory saving.
    categories : bool, default True
        Convert ``object`` columns to ``category``. Pass ``False`` to leave
        them untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    # All types that we want to change for "lighter" ones.
    # int8 and float16 are not included because we cannot reduce
    # those data types.
    # float32 is not included because float16 has too low precision.
    numeric2reduce = ["int16", "int32", "int64", "float64"]
    start_mem = memory_usage_mb(df, deep=deep) if verbose else 0

    # ``Series.items`` replaces ``iteritems``, which pandas 2.0 removed.
    for col, col_type in df.dtypes.items():
        best_type = None
        if categories and col_type == "object":
            df[col] = df[col].astype("category")
            best_type = "category"
        elif col_type in numeric2reduce:
            downcast = "integer" if "int" in str(col_type) else "float"
            df[col] = pd.to_numeric(df[col], downcast=downcast)
            best_type = df[col].dtype.name
        # Log the conversion performed.
        if verbose and best_type is not None and best_type != str(col_type):
            print(f"Column '{col}' converted from {col_type} to {best_type}")

    if verbose:
        end_mem = memory_usage_mb(df, deep=deep)
        diff_mem = start_mem - end_mem
        # Guard the ratio so a zero-byte baseline cannot divide by zero.
        percent_mem = 100 * diff_mem / start_mem if start_mem else 0.0
        print(f"Memory usage decreased from"
              f" {start_mem:.2f}MB to {end_mem:.2f}MB"
              f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")
    return df

## 1/ Read the data for the training set

In [24]:
# read data files
def read_data(case='train'):
    """Load the merged dataset for ``case``, using a pickle cache when present.

    The cache ``<case>.pkl`` in the working directory is tried first. When it
    is missing, ``data/<case>.csv`` is left-joined with
    ``data/building_metadata.csv`` (on ``building_id``), sorted by building,
    meter and timestamp, then left-joined with ``data/weather_<case>.csv``
    (on ``site_id`` and ``timestamp``). The ``timestamp`` column is parsed to
    datetimes and moved to the end, dtypes are downcast with
    ``reduce_memory_usage``, and the result is written to the cache.

    Returns the resulting DataFrame.
    """
    try:
        print('read pickle')
        cache_file = case + '.pkl'
        # NOTE: unpickling can execute arbitrary code; only load caches
        # written by this notebook.
        df = pd.read_pickle(cache_file)
        print('read pickle successful')
    except FileNotFoundError:
        print('read data from source files')
        df = pd.read_csv(f'data/{case}.csv')
        buildings = pd.read_csv('data/building_metadata.csv')
        weather = pd.read_csv(f'data/weather_{case}.csv')

        # merge datasets
        print('merge datasets')
        df = df.merge(buildings, on='building_id', how='left')
        df = df.sort_values(['building_id', 'meter', 'timestamp'])
        del buildings  # release the metadata frame as soon as it is merged
        df = df.merge(weather, on=['site_id', 'timestamp'], how='left')
        del weather
        # pop + reassign parses the strings and moves the column to the end.
        df['timestamp'] = pd.to_datetime(df.pop('timestamp'))
        print('reduce memory')
        df = reduce_memory_usage(df)
        df.to_pickle(cache_file)
    return df
    
# Load the training set; read_data tries the train.pkl cache before the CSV files.
df_tr = read_data('train')
df_tr.head()
# NOTE(review): the original reminder here was to use gc (e.g. del + gc.collect())
# to release large intermediates and avoid memory errors -- confirm where it is needed.

read pickle
read pickle successful


Unnamed: 0,building_id,meter,meter_reading,site_id,primary_use,square_feet,year_built,floor_count,air_temperature,cloud_coverage,dew_temperature,precip_depth_1_hr,sea_level_pressure,wind_direction,wind_speed,timestamp
0,0,0,0.0,0,Education,7432,2008.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016-01-01
1,1,0,0.0,0,Education,2720,2004.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016-01-01
2,2,0,0.0,0,Education,5376,1991.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016-01-01
3,3,0,0.0,0,Education,23685,2002.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016-01-01
4,4,0,0.0,0,Education,116607,1975.0,,25.0,6.0,20.0,,1019.700012,0.0,0.0,2016-01-01


## 2/ Perform feature engineering

Create features

In [4]:
#TODO, use the following: https://www.kaggle.com/fk0728/feature-engineering-with-sklearn-pipelines/data ! Go.
# replace below by an assign (for performance)
def create_features(df):
    """Derive calendar features from ``timestamp`` and label-encode ``primary_use``.

    The frame is modified in place and also returned. The ``timestamp``
    column is consumed and dropped, so calling this a second time on the same
    frame raises ``KeyError``.

    Columns added:
      * ``is_weekend`` -- 1 on Saturday/Sunday, 0 otherwise (uint8).
      * ``weekend``    -- despite its name, the day of week (Monday=0 .. Sunday=6).
      * ``year``, ``month``, ``day``, ``hour`` -- calendar parts of ``timestamp``.

    ``primary_use`` is replaced by its integer code from the fixed table
    below; labels that are not in the table map to NaN.
    """
    day_of_week = df["timestamp"].dt.weekday
    # Vectorised test instead of a per-row Python ``apply``: weekdays 5 and 6
    # are Saturday and Sunday.
    df['is_weekend'] = (day_of_week >= 5).astype(np.uint8)
    # Kept under the historical name "weekend" because later cells select it by name.
    df["weekend"] = day_of_week.astype(np.uint8)
    df["year"] = df["timestamp"].dt.year.astype(np.uint16)
    df["month"] = df["timestamp"].dt.month.astype(np.uint8)
    df["day"] = df["timestamp"].dt.day.astype(np.uint8)
    df["hour"] = df["timestamp"].dt.hour.astype(np.uint8)
    df.drop('timestamp', axis=1, inplace=True)

    # apply labeling to 'primary_use'
    le_dict = {'Education': 0,
               'Office': 6,
               'Entertainment/public assembly': 1,
               'Lodging/residential': 4,
               'Public services': 9,
               'Healthcare': 3,
               'Other': 7,
               'Parking': 8,
               'Manufacturing/industrial': 5,
               'Food sales and service': 2,
               'Retail': 11,
               'Warehouse/storage': 15,
               'Services': 12,
               'Technology/science': 13,
               'Utility': 14,
               'Religious worship': 10}

    df['primary_use'] = df['primary_use'].map(le_dict)
    return df

# create_features modifies df_tr in place and drops 'timestamp', so this cell
# can only be run once per freshly loaded frame.
df_tr = create_features(df_tr)

Evolution of meter reading with other continuous variables

In [5]:
# TODO: much more advanced feature engineering still to be performed

Evolution of meter reading with categorical variables

# Build pipeline and first model 

In [6]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [7]:
#TODO use "primary_use" and other variables
# Columns sent through the categorical imputer.
# TODO: add an "is_holiday" flag.
categorials_to_label = [
    'weekend',
    'month',
    'day',
    'meter',
]

# Columns sent through the numeric imputer.
continuous_variables = [
    'air_temperature',
    'cloud_coverage',
    'dew_temperature',
    # 'precip_depth_1_hr', 'sea_level_pressure',   # do not keep these
    'floor_count',
    'year_built',
    'square_feet',
    'primary_use',  # already put to numerical
]

In [8]:
# Model inputs: the categorical columns followed by the continuous ones.
train_sel = df_tr[[*categorials_to_label, *continuous_variables]]
# The model is trained on log(1 + meter_reading); negative values are floored at 0.
target_train = df_tr['meter_reading'].pipe(np.log1p)
target_train = np.where(target_train < 0, 0, target_train)

In [9]:

# Imputation only, for now.
# TODO: later, impute numerical and categorical columns from other variables
# instead of a single column-wide statistic.
numeric_transformer = Pipeline(steps=[
    # Missing numeric values are replaced by the column median.
    ('imputer', SimpleImputer(strategy='median'))])

categorical_transformer = Pipeline(steps=[
    # Missing categorical values are replaced by the most frequent value.
    # No fill_value here: SimpleImputer only reads it when strategy='constant'.
    ('imputer', SimpleImputer(strategy='most_frequent'))])

In [10]:
# Send each column group to its own imputer pipeline. The output places the
# 'num' columns first, then the 'cat' columns.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, continuous_variables),
    ('cat', categorical_transformer, categorials_to_label),
])

# Build the grid search, define the hyperparameters and train the model

In [11]:
# Hyper-parameter grid for the pipeline step named 'lgbm'
# (keys follow the '<step>__<parameter>' convention).
# Only n_estimators varies, so the search compares two candidates.
# NOTE(review): LightGBM reportedly applies `subsample` only when
# `subsample_freq` > 0 -- confirm that the 0.9 below has any effect.
param_grid = dict(
    # 'regressor__xgb__n_estimators': [100, 1000],
    # 'regressor__xgb__learning_rate': [0.05, 0.1]
    lgbm__n_estimators=[100, 1000],
    lgbm__learning_rate=[0.1],
    lgbm__colsample_bytree=[0.9],
    lgbm__subsample=[0.9],
    lgbm__reg_alpha=[0.1],
    lgbm__reg_lambda=[0.1],
)



In [12]:
# Full model: imputation followed by a LightGBM regressor.
# (Alternative final step kept for reference: ('xgb', XGBRegressor()).)
regr = Pipeline(steps=[('preprocessor', preprocessor),
                       ('lgbm', LGBMRegressor())])
# NOTE(review): presumably set for reproducibility -- verify whether LGBMRegressor
# reads NumPy's global seed or needs an explicit random_state.
np.random.seed(47)
# NOTE(review): TimeSeriesSplit cuts folds by row position. read_data sorts rows by
# building_id, meter, then timestamp, so the folds may not be chronological -- confirm.
tscv = TimeSeriesSplit(n_splits=5)
# `iid` is no longer passed: scikit-learn deprecated it in 0.22 and removed it in
# 0.24. The five test folds produced by TimeSeriesSplit have equal size, so the
# size-weighted mean it requested equals the plain mean of the fold scores.
# Scoring is the negated MSE (scikit-learn maximises scores); see
# https://scikit-learn.org/stable/modules/model_evaluation.html
cv = GridSearchCV(regr, param_grid, refit=True, n_jobs=4, verbose=50, cv=tscv,
                  scoring='neg_mean_squared_error')
cv.fit(train_sel, target_train)

Fitting 5 folds for each of 2 candidates, totalling 10 fits
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done   1 tasks      | elapsed:  1.5min
[Parallel(n_jobs=4)]: Done   2 tasks      | elapsed:  2.6min
[Parallel(n_jobs=4)]: Done   3 tasks      | elapsed:  3.4min
[Parallel(n_jobs=4)]: Done   4 out of  10 | elapsed:  4.0min remaining:  5.9min
[Parallel(n_jobs=4)]: Done   5 out of  10 | elapsed:  6.4min remaining:  6.4min
[Parallel(n_jobs=4)]: Done   6 out of  10 | elapsed: 11.2min remaining:  7.5min
[Parallel(n_jobs=4)]: Done   7 out of  10 | elapsed: 16.1min remaining:  6.9min
[Parallel(n_jobs=4)]: Done   8 out of  10 | elapsed: 20.1min remaining:  5.0min
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 29.7min remaining:    0.0s
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 29.7min finished


GridSearchCV(cv=TimeSeriesSplit(max_train_size=None, n_splits=5),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('preprocessor',
                                        ColumnTransformer(n_jobs=None,
                                                          remainder='drop',
                                                          sparse_threshold=0.3,
                                                          transformer_weights=None,
                                                          transformers=[('num',
                                                                         Pipeline(memory=None,
                                                                                  steps=[('imputer',
                                                                                          SimpleImputer(add_indicator=False,
                                                                           

In [13]:
# Inspect the grid-search results (fit/score timings and parameter values per candidate).
cv.cv_results_

{'mean_fit_time': array([156.15624223, 666.86137772]),
 'std_fit_time': array([ 68.15666893, 251.34468369]),
 'mean_score_time': array([ 35.66681623, 216.56990886]),
 'std_score_time': array([ 3.42536711, 36.49043323]),
 'param_lgbm__colsample_bytree': masked_array(data=[0.9, 0.9],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_lgbm__learning_rate': masked_array(data=[0.1, 0.1],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_lgbm__n_estimators': masked_array(data=[100, 1000],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_lgbm__reg_alpha': masked_array(data=[0.1, 0.1],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_lgbm__reg_lambda': masked_array(data=[0.1, 0.1],
              mask=[False, False],
        fill_value='?',
             dtype=object),
 'param_lgbm__subsample': masked_array(data=[0.9,

# Score the test set

In [25]:
# Load the test set and run it through the same feature engineering as the training set.
df_test = create_features(read_data('test'))
# Select the columns in the same order as for training.
test_sel = df_test[[*categorials_to_label, *continuous_variables]]

read pickle
read data from source files
merge datasets
reduce memory
Column 'row_id' converted from int64 to int32
Column 'building_id' converted from int64 to int16
Column 'meter' converted from int64 to int8
Column 'site_id' converted from int64 to int8
Column 'primary_use' converted from object to category
Column 'square_feet' converted from int64 to int32
Column 'year_built' converted from float64 to float32
Column 'floor_count' converted from float64 to float32
Column 'air_temperature' converted from float64 to float32
Column 'cloud_coverage' converted from float64 to float32
Column 'dew_temperature' converted from float64 to float32
Column 'precip_depth_1_hr' converted from float64 to float32
Column 'sea_level_pressure' converted from float64 to float32
Column 'wind_direction' converted from float64 to float32
Column 'wind_speed' converted from float64 to float32
Memory usage decreased from 7848.49MB to 2584.79MB (5263.70MB, 67.07% reduction)


In [27]:
# Predict with the refitted best estimator; predictions are on the log1p scale.
y_pred = cv.predict(test_sel)
# Undo the log1p transform. A prediction below 0 on the log scale would turn into
# a negative meter reading, so floor at 0 exactly as the training target was floored.
readings = np.expm1(y_pred)
df_test['meter_reading'] = np.where(readings < 0, 0, readings)
df_test[['row_id', 'meter_reading']].to_csv('submission_1.csv.gz', index=False, compression='gzip')
df_test.shape

(41697600, 22)