In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from datetime import datetime
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 100)

In [2]:
df = pd.read_pickle("../input/consumption_month.pkl")
df["cluster"] = 0

cluster_cut = [0, 271, 542, 813, 1084, 1355, 1626, 1897, 2168, 2438, 2708, 2978, 3248]
for i in range(12):
    df.loc[cluster_cut[i]:cluster_cut[i+1], "cluster"] = i

In [3]:
df_train = pd.read_pickle("../input/df_train_clean.pkl")
df_test = pd.read_pickle("../input/df_test_clean.pkl")

print(df_train.shape, df_test.shape)
df_train.head()

(627584, 7) (1185520, 7)


Unnamed: 0,meter_id,date,meter_reading,month,day_of_month,day_of_week,meter_reading_mean
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-01,4.143,8,1,1,3.241
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-02,4.116,8,2,2,3.241
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-03,4.101,8,3,3,3.241
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-04,4.124,8,4,4,3.241
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-05,4.111,8,5,5,3.241


In [4]:
le = preprocessing.LabelEncoder()
le.fit(df_test["meter_id"])

df_train["meter_id_code"] = le.transform(df_train["meter_id"])
df_test["meter_id_code"] = le.transform(df_test["meter_id"])

df_train.head()

Unnamed: 0,meter_id,date,meter_reading,month,day_of_month,day_of_week,meter_reading_mean,meter_id_code
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-01,4.143,8,1,1,3.241,0
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-02,4.116,8,2,2,3.241,0
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-03,4.101,8,3,3,3.241,0
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-04,4.124,8,4,4,3.241,0
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-05,4.111,8,5,5,3.241,0


In [5]:
for df_t in [df_train, df_test]:
    df_t['month_sin'] = np.sin(2 * np.pi * df_t['month']/11.0)
    df_t['month_cos'] = np.cos(2 * np.pi * df_t['month']/11.0)

TARGET = "meter_reading"

use_cols = ["meter_id_code", "day_of_week", "day_of_month", "meter_reading_mean", "month_sin", "month_cos"]
print(f"used cols are: {use_cols}")

cat_cols = ["meter_id_code", "day_of_week", "day_of_month"]
print(f"categorical cols are: {cat_cols}")

for col in cat_cols:
    df_train[col] = df_train[col].astype('category')
    df_test[col] = df_test[col].astype('category')

used cols are: ['meter_id_code', 'day_of_week', 'day_of_month', 'meter_reading_mean', 'month_sin', 'month_cos']
categorical cols are: ['meter_id_code', 'day_of_week', 'day_of_month']


In [6]:
param = {
    'metric': 'mae',
    'num_leaves': 64,
    'learning_rate': 0.01,
    'n_estimators': 5000,
    'subsample': 0.5, 
    "lambda_l1": 8,
    "lambda_l2": 4,
    "max_depth":4,
    'seed': 42,
    'n_jobs':-1
}

cv = 3
kf = KFold(n_splits=cv, shuffle=True, random_state=42)
skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

In [7]:
for cluster_idx in range(12):
    print("#" * 50)
    print(f"Training on cluster {cluster_idx}...")
    df_cluster_meter_list = df[df.cluster == cluster_idx].meter_id.values
    df_train_cluster_idx = df_train[df_train.meter_id.isin(df_cluster_meter_list)].reset_index(drop=True)

    for train_index, valid_index in skf.split(df_train_cluster_idx, df_train_cluster_idx["meter_id"]):

        X_train, X_valid = df_train_cluster_idx.loc[train_index, use_cols], df_train_cluster_idx.loc[valid_index, use_cols]
        y_train, y_valid = df_train_cluster_idx.loc[train_index, TARGET], df_train_cluster_idx.loc[valid_index, TARGET]

        model_lgb = lgb.LGBMRegressor(**param)
        model_lgb.fit(X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            categorical_feature=cat_cols,
            early_stopping_rounds=200,
            verbose=False)

        oof_valid = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration_)
        df_train_cluster_idx.loc[valid_index, "oof"] = oof_valid

        mae = np.sqrt(mean_absolute_error(y_valid, oof_valid))

        #TEST
        df_test.loc[df_test.meter_id.isin(df_cluster_meter_list), TARGET] += model_lgb.predict(df_test.loc[df_test.meter_id.isin(df_cluster_meter_list), use_cols], num_iteration=model_lgb.best_iteration_) / cv
    
    oof_mae = np.sqrt(mean_absolute_error(df_train_cluster_idx[TARGET], df_train_cluster_idx["oof"]))
    print(f"OOF MAE SCORE IS: {oof_mae:0.3f}")

df_test["meter_reading"] = df_test.meter_reading.clip(lower=0)

##################################################
Training on cluster 0...
OOF MAE SCORE IS: 1.507
##################################################
Training on cluster 1...
OOF MAE SCORE IS: 1.561
##################################################
Training on cluster 2...
OOF MAE SCORE IS: 1.493
##################################################
Training on cluster 3...
OOF MAE SCORE IS: 1.513
##################################################
Training on cluster 4...
OOF MAE SCORE IS: 1.466
##################################################
Training on cluster 5...
OOF MAE SCORE IS: 1.389
##################################################
Training on cluster 6...
OOF MAE SCORE IS: 1.373
##################################################
Training on cluster 7...
OOF MAE SCORE IS: 1.356
##################################################
Training on cluster 8...
OOF MAE SCORE IS: 1.397
##################################################
Training on cluster 9...
OOF MAE SCORE IS: 1.425


In [8]:
# Write to submission
df_month = df_test.groupby(["meter_id", "month"])["meter_reading"].sum().unstack()

month_cols = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
sub = pd.DataFrame(columns=["meter_id"] + month_cols)

sub["meter_id"] = df_month.index.values
sub[month_cols] = df_month.values.round()

# sub.to_csv("../submissions/mini_lgb_baseline_clean.csv", index=False, header=True)

In [9]:
# df_month_group = df_train.groupby(["meter_id", "month"])["meter_reading"].sum().unstack()
# df_train_month = pd.DataFrame(columns=["meter_id"] + month_cols)

# df_train_month["meter_id"] = df_month_group.index.values
# df_train_month[month_cols] = df_month_group.values.round(3)
# for meter_x in set(df_test.meter_id) - set(df_train_month.meter_id):
#     df_train_month = df_train_month.append({"meter_id": meter_x}, ignore_index=True)
# df_train_month.fillna(0, inplace=True)

In [10]:
# df = pd.read_pickle("../input/consumption_month.pkl")
# meter_id_list = df.meter_id.values

# # VISUALIZE PREDICTIONS
# for idx, meter_idx in enumerate(tqdm(meter_id_list)):

#     df_meter_idx = df_train[df_train.meter_id == meter_idx]

#     fig = plt.figure(figsize=(21,18))
#     plt.subplot(2, 1, 1)
#     plt.plot(range(12), df_train_month[df_train_month.meter_id == meter_idx].values.reshape([-1])[1:], lw=5, label="train")
#     plt.plot(range(12), df_train_month[df_train_month.meter_id == meter_idx].values.reshape([-1])[1:], ".b", ms=25)

#     plt.plot(range(12, 24), sub[sub.meter_id == meter_idx].values.reshape([-1])[1:], lw=5, label="prediction")
#     plt.plot(range(12, 24), sub[sub.meter_id == meter_idx].values.reshape([-1])[1:], ".g", ms=25)
    
#     plt.grid()
#     plt.legend(fontsize=15)
#     plt.title(f"{idx}-{meter_idx}", fontsize=25)
    
#     ######################################################
#     plt.subplot(2, 1, 2)

#     df_meter_idx["date"] = pd.to_datetime(df_meter_idx["date"])
#     df_meter_idx.set_index("date", inplace=True)

#     df_test_meter_idx = df_test[df_test.meter_id == meter_idx]

#     df_test_meter_idx["date"] = pd.to_datetime(df_test_meter_idx["date"])
#     df_test_meter_idx.set_index("date", inplace=True)

#     plt.plot(df_meter_idx.meter_reading, lw=3, c="b", alpha=1, label="train")
#     plt.plot(df_test_meter_idx.meter_reading, lw=3, c="g", alpha=1, label="prediction")

#     plt.xlim([datetime(2017, 1, 1), datetime(2018, 12, 31)])

#     plt.grid()
#     plt.title(meter_idx, fontsize=25)
#     plt.legend(fontsize=15)

#     plt.savefig(f"../eda/mini_lgb_clean_visual_check/{idx}-{meter_idx}.png")