In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from datetime import datetime
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn import preprocessing
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error

import warnings
warnings.filterwarnings("ignore")
pd.set_option('display.max_rows', 100)

In [2]:
df_train = pd.read_pickle("../input/df_train_clean.pkl")
df_test = pd.read_pickle("../input/df_test_clean.pkl")

print(df_train.shape, df_test.shape)
df_train.head()

(627584, 7) (1185520, 7)


Unnamed: 0,meter_id,date,meter_reading,month,day_of_month,day_of_week,meter_reading_mean
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-01,4.143,8,1,1,3.241
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-02,4.116,8,2,2,3.241
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-03,4.101,8,3,3,3.241
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-04,4.124,8,4,4,3.241
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-05,4.111,8,5,5,3.241


In [3]:
le = preprocessing.LabelEncoder()
le.fit(df_test["meter_id"])

df_train["meter_id_code"] = le.transform(df_train["meter_id"])
df_test["meter_id_code"] = le.transform(df_test["meter_id"])

df_train.head()

Unnamed: 0,meter_id,date,meter_reading,month,day_of_month,day_of_week,meter_reading_mean,meter_id_code
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-01,4.143,8,1,1,3.241,0
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-02,4.116,8,2,2,3.241,0
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-03,4.101,8,3,3,3.241,0
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-04,4.124,8,4,4,3.241,0
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-05,4.111,8,5,5,3.241,0


In [4]:
for df in [df_train, df_test]:
    df['month_sin'] = np.sin(2 * np.pi * df['month']/11.0)
    df['month_cos'] = np.cos(2 * np.pi * df['month']/11.0)

TARGET = "meter_reading"

use_cols = ["meter_id_code", "day_of_week", "day_of_month", "month", "meter_reading_mean", "month_sin", "month_cos"]
print(f"used cols are: {use_cols}")

cat_cols = ["meter_id_code", "day_of_week", "day_of_month", "month"]
print(f"categorical cols are: {cat_cols}")

for col in cat_cols:
    df_train[col] = df_train[col].astype('category')
    df_test[col] = df_test[col].astype('category')

used cols are: ['meter_id_code', 'day_of_week', 'day_of_month', 'month', 'meter_reading_mean', 'month_sin', 'month_cos']
categorical cols are: ['meter_id_code', 'day_of_week', 'day_of_month', 'month']


In [5]:
param = {
    'metric': 'mae',
    'num_leaves': 256,
    'learning_rate': 0.01,
    'n_estimators': 10000,
    'subsample': 0.85, 
    "lambda_l1": 16,
    "lambda_l2": 8,
    "max_depth":8,
    'seed': 42,
    'n_jobs':-1
}

In [6]:
cv = 3
kf = KFold(n_splits=cv, shuffle=True, random_state=42)
skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

for train_index, valid_index in skf.split(df_train, df_train["meter_id"]):

    X_train, X_valid = df_train.loc[train_index, use_cols], df_train.loc[valid_index, use_cols]
    y_train, y_valid = df_train.loc[train_index, TARGET], df_train.loc[valid_index, TARGET]

    model_lgb = lgb.LGBMRegressor(**param)
    model_lgb.fit(X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        categorical_feature=cat_cols,
        early_stopping_rounds=200,
        verbose=1000)

    oof_valid = model_lgb.predict(X_valid, num_iteration=model_lgb.best_iteration_)
    df_train.loc[valid_index, "oof"] = oof_valid
    
    mae = np.sqrt(mean_absolute_error(y_valid, oof_valid))
    
    #TEST
    df_test[TARGET] += model_lgb.predict(df_test[use_cols], num_iteration=model_lgb.best_iteration_) / cv

df_test["meter_reading"] = df_test.meter_reading.clip(lower=0)
oof_mae = np.sqrt(mean_absolute_error(df_train[TARGET], df_train["oof"]))
print("#" * 50)
print(f"OOF MAE SCORE IS: {oof_mae:0.3f}") #1.362

Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's l1: 1.9747
[2000]	valid_0's l1: 1.91533
[3000]	valid_0's l1: 1.88726
[4000]	valid_0's l1: 1.87115
[5000]	valid_0's l1: 1.86503
[6000]	valid_0's l1: 1.86141
[7000]	valid_0's l1: 1.85909
Early stopping, best iteration is:
[7036]	valid_0's l1: 1.85886
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's l1: 1.96467
[2000]	valid_0's l1: 1.90923
[3000]	valid_0's l1: 1.8774
[4000]	valid_0's l1: 1.86557
[5000]	valid_0's l1: 1.85807
[6000]	valid_0's l1: 1.85362
[7000]	valid_0's l1: 1.85151
Early stopping, best iteration is:
[7043]	valid_0's l1: 1.85147
Training until validation scores don't improve for 200 rounds.
[1000]	valid_0's l1: 1.94909
[2000]	valid_0's l1: 1.90234
[3000]	valid_0's l1: 1.87547
[4000]	valid_0's l1: 1.86008
[5000]	valid_0's l1: 1.85492
Early stopping, best iteration is:
[5197]	valid_0's l1: 1.85414
##################################################
OOF MAE SCORE IS: 1

In [7]:
# Write to submission
df_month = df_test.groupby(["meter_id", "month"])["meter_reading"].sum().unstack()

month_cols = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]
sub = pd.DataFrame(columns=["meter_id"] + month_cols)

sub["meter_id"] = df_month.index.values
sub[month_cols] = df_month.values.round()

# sub.to_csv("../submissions/lgb_baseline_clean.csv", index=False, header=True)

In [8]:
#BELOW IS FOR VISUALIZATION

df_month_group = df_train.groupby(["meter_id", "month"])["meter_reading"].sum().unstack()
df_train_month = pd.DataFrame(columns=["meter_id"] + month_cols)

df_train_month["meter_id"] = df_month_group.index.values
df_train_month[month_cols] = df_month_group.values.round(3)
for meter_x in set(df_test.meter_id) - set(df_train_month.meter_id):
    df_train_month = df_train_month.append({"meter_id": meter_x}, ignore_index=True)
df_train_month.fillna(0, inplace=True)

In [10]:
# df = pd.read_pickle("../input/consumption_month.pkl")
# meter_id_list = df.meter_id.values

# # VISUALIZE PREDICTIONS
# for idx, meter_idx in enumerate(tqdm(meter_id_list)):

#     df_meter_idx = df_train[df_train.meter_id == meter_idx]

#     fig = plt.figure(figsize=(21,18))
#     plt.subplot(2, 1, 1)
#     plt.plot(range(12), df_train_month[df_train_month.meter_id == meter_idx].values.reshape([-1])[1:], lw=5, label="train")
#     plt.plot(range(12), df_train_month[df_train_month.meter_id == meter_idx].values.reshape([-1])[1:], ".b", ms=25)

#     plt.plot(range(12, 24), sub[sub.meter_id == meter_idx].values.reshape([-1])[1:], lw=5, label="prediction")
#     plt.plot(range(12, 24), sub[sub.meter_id == meter_idx].values.reshape([-1])[1:], ".g", ms=25)

#     plt.grid()
#     plt.legend(fontsize=15)
#     plt.title(f"{idx}-{meter_idx}", fontsize=25)
    
#     ######################################################
#     plt.subplot(2, 1, 2)

#     df_meter_idx["date"] = pd.to_datetime(df_meter_idx["date"])
#     df_meter_idx.set_index("date", inplace=True)

#     df_test_meter_idx = df_test[df_test.meter_id == meter_idx]

#     df_test_meter_idx["date"] = pd.to_datetime(df_test_meter_idx["date"])
#     df_test_meter_idx.set_index("date", inplace=True)

#     plt.plot(df_meter_idx.meter_reading, lw=3, c="b", alpha=1, label="train")
#     plt.plot(df_test_meter_idx.meter_reading, lw=3, c="g", alpha=1, label="prediction")

#     plt.xlim([datetime(2017, 1, 1), datetime(2018, 12, 31)])

#     plt.grid()
#     plt.title(meter_idx, fontsize=25)
#     plt.legend(fontsize=15)

#     plt.savefig(f"../eda/lgb_clean_visual_check/{idx}-{meter_idx}.png")