In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from datetime import datetime

import warnings
# Suppress every warning for the rest of the session, regardless of category
# or originating module. NOTE(review): this also hides deprecation and
# chained-assignment notices raised by the cells below; consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# Load the wide consumption table: one row per meter, a `meter_id` column
# followed by one column per day.
# NOTE(review): read_pickle can execute arbitrary code while unpickling, so
# only load this file from a trusted source.
consumption_path = "../input/consumption_day.pkl"
df = pd.read_pickle(consumption_path)

print(df.shape)
df.head()

(3248, 366)


Unnamed: 0,meter_id,2017-01-01,2017-01-02,2017-01-03,2017-01-04,2017-01-05,2017-01-06,2017-01-07,2017-01-08,2017-01-09,...,2017-12-22,2017-12-23,2017-12-24,2017-12-25,2017-12-26,2017-12-27,2017-12-28,2017-12-29,2017-12-30,2017-12-31
0,0xa62b9f23553ff183f61e2bf943aab3d5983d02d7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.03,5.397,0.0,4.818,3.931,0.0,4.503,0.0,5.129,5.395
1,0x459c834d1f6cfb5b734b82aa9f5410fa97fb70da,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13.101,14.327,0.0,14.936,16.174,0.0,24.618,0.0,15.167,11.751
2,0x4a1ed36825360a058cec2bdd409fc2459e1ce54f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.201,7.32,0.0,7.384,14.425,0.0,17.705,0.0,8.966,4.633
3,0x5b76d3c0e0aefc6e0a8d1d031f96388a23263407,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,14.833,12.477,0.0,10.974,19.646,0.0,23.993,0.0,15.841,14.452
4,0x943ebe39ef2be6ef807c42c5a647e27112ca5b0f,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,39.44,35.538,0.0,8.351,9.957,0.0,25.871,0.0,46.274,16.901


In [3]:
start = datetime(2017, 1, 1)
end = datetime(2017, 12, 31)
date_range = pd.date_range(start, end)

# Reshape the wide table (one row per meter, one column per day) into long
# form (one row per meter/day) in a single vectorised step.
#
# The previous per-meter loop grew df_train with pd.concat, re-copying the
# accumulated frame on every iteration, and relied on stack(), which drops NaN
# readings and so breaks the positional pairing with date_range.
reading_columns = [col for col in df.columns if col != "meter_id"]
if len(reading_columns) != len(date_range):
    raise ValueError(
        f"expected {len(date_range)} daily columns, found {len(reading_columns)}"
    )

meter_ids = df["meter_id"].values
# Shape (n_meters, n_days). Columns are paired with date_range by position,
# exactly as the original loop did.
readings = df[reading_columns].values.astype(float)

df_train = pd.DataFrame({
    # Each meter id is repeated once per day ...
    "meter_id": np.repeat(meter_ids, len(date_range)),
    # ... while the full calendar is repeated once per meter ...
    "date": np.tile(date_range.values, len(meter_ids)),
    # ... which matches the row-major flattening of the readings matrix.
    "meter_reading": readings.ravel(),
})

df_train = df_train.sort_values(["meter_id", "date"]).reset_index(drop=True)
df_train["meter_reading"] = df_train["meter_reading"].round(3)
print(df_train.shape)
df_train.head()

HBox(children=(IntProgress(value=0, max=3248), HTML(value='')))


(1185520, 3)


Unnamed: 0,meter_id,date,meter_reading
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-01-01,0.0
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-01-02,0.0
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-01-03,0.0
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-01-04,0.0
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-01-05,0.0


In [4]:
# remove missing data
# A row is kept only when the trailing 3-day mean of its meter's readings
# exceeds the threshold. The first two days of every meter have no complete
# window (NaN -> -1) and are therefore dropped as well.
rolling_window = 3
min_rolling_mean = 0.01

rolling_mean = (
    df_train.groupby(["meter_id"])["meter_reading"]
    .rolling(rolling_window)
    .mean()
    # Drop the meter_id level so the result is keyed by df_train's own index.
    .reset_index(level=0, drop=True)
)
# Align on the index instead of assigning `.values` positionally: positional
# assignment is only correct while df_train happens to be sorted by meter_id.
keep_mask = rolling_mean.reindex(df_train.index).fillna(-1) > min_rolling_mean

df_train = df_train[keep_mask].reset_index(drop=True)
print(df_train.shape)
df_train.head()

(636307, 3)


Unnamed: 0,meter_id,date,meter_reading
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-01,4.143
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-02,4.116
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-03,4.101
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-04,4.124
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-05,4.111


In [5]:
# Derive integer calendar features from each training row's date.
train_dates = df_train["date"].dt
train_calendar_features = (
    ("month", train_dates.month),
    ("day_of_month", train_dates.day),
    ("day_of_week", train_dates.dayofweek),
)
for feature_name, feature_values in train_calendar_features:
    df_train[feature_name] = feature_values.astype(int)

print(df_train.shape)
df_train.head()

(636307, 6)


Unnamed: 0,meter_id,date,meter_reading,month,day_of_month,day_of_week
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-01,4.143,8,1,1
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-02,4.116,8,2,2
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-03,4.101,8,3,3
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-04,4.124,8,4,4
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-05,4.111,8,5,5


In [6]:
# mean meter reading per meter, computed over the retained training rows
meter_reading_mean = (
    df_train.groupby(["meter_id"])["meter_reading"]
    .mean()
    .rename("meter_reading_mean")
    .round(3)
)

# Drop any copy left by a previous run of this cell; otherwise the merge
# would emit meter_reading_mean_x / _y columns and the fillna below would fail.
df_train = df_train.drop(columns=["meter_reading_mean"], errors="ignore").merge(
    meter_reading_mean, on="meter_id", how="left"
)
# Assign the filled column back rather than calling fillna(inplace=True) on
# the selected column: that chained in-place call operates on a temporary
# under pandas Copy-on-Write and leaves the frame unchanged.
df_train["meter_reading_mean"] = df_train["meter_reading_mean"].fillna(0)

print(f"meter map length is {len(meter_reading_mean)}")
# Meters present in the raw table that have no training rows left, and so no mean.
print(set(df.meter_id) - set(meter_reading_mean.index.values))

print(df_train.shape)
df_train.head()

meter map length is 3247
{'0x81fa8eddb2b09393d3719984ca5520cb50f45efd'}
(636307, 7)


Unnamed: 0,meter_id,date,meter_reading,month,day_of_month,day_of_week,meter_reading_mean
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-01,4.143,8,1,1,3.241
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-02,4.116,8,2,2,3.241
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-03,4.101,8,3,3,3.241
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-04,4.124,8,4,4,3.241
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2017-08-05,4.111,8,5,5,3.241


In [7]:
test_start = datetime(2018, 1, 1)
test_end = datetime(2018, 12, 31)
test_date_range = pd.date_range(test_start, test_end)

# Build the prediction grid (every meter x every day of the test year) in one
# vectorised step. The previous per-meter loop grew df_test with pd.concat,
# re-copying the accumulated frame on every iteration.
test_meter_ids = df["meter_id"].values

df_test = pd.DataFrame({
    # Each meter id is repeated once per test day ...
    "meter_id": np.repeat(test_meter_ids, len(test_date_range)),
    # ... and paired with the full test calendar, repeated once per meter.
    "date": np.tile(test_date_range.values, len(test_meter_ids)),
    "meter_reading": 0,  # target
})

df_test = df_test.sort_values(["meter_id", "date"]).reset_index(drop=True)

print(df_test.shape)
df_test.head()

HBox(children=(IntProgress(value=0, max=3248), HTML(value='')))


(1185520, 3)


Unnamed: 0,meter_id,date,meter_reading
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-01,0
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-02,0
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-03,0
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-04,0
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-05,0


In [8]:
# Derive integer calendar features from each test row's date.
test_dates = df_test["date"].dt
test_calendar_features = (
    ("month", test_dates.month),
    ("day_of_month", test_dates.day),
    ("day_of_week", test_dates.dayofweek),
)
for feature_name, feature_values in test_calendar_features:
    df_test[feature_name] = feature_values.astype(int)

print(df_test.shape)
df_test.head()

(1185520, 6)


Unnamed: 0,meter_id,date,meter_reading,month,day_of_month,day_of_week
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-01,0,1,1,0
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-02,0,1,2,1
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-03,0,1,3,2
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-04,0,1,4,3
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-05,0,1,5,4


In [9]:
# Attach each meter's mean training reading to the test grid.
# Drop any copy left by a previous run of this cell; otherwise the merge
# would emit meter_reading_mean_x / _y columns and the fillna below would fail.
df_test = df_test.drop(columns=["meter_reading_mean"], errors="ignore").merge(
    meter_reading_mean, on="meter_id", how="left"
)
# Meters without any retained training rows have no mean and get 0.
# Assign the filled column back rather than calling fillna(inplace=True) on
# the selected column: that chained in-place call operates on a temporary
# under pandas Copy-on-Write and would leave those NaN values in place.
df_test["meter_reading_mean"] = df_test["meter_reading_mean"].fillna(0)

print(df_test.shape)
df_test.head()

(1185520, 7)


Unnamed: 0,meter_id,date,meter_reading,month,day_of_month,day_of_week,meter_reading_mean
0,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-01,0,1,1,0,3.241
1,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-02,0,1,2,1,3.241
2,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-03,0,1,3,2,3.241
3,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-04,0,1,4,3,3.241
4,0x0001f1c389823f953b2eaee0a61c33539744da0c,2018-01-05,0,1,5,4,3.241


In [10]:
# Final check: report the shapes of the engineered train and test frames.
train_shape, test_shape = df_train.shape, df_test.shape
print(train_shape, test_shape)

# Persisting the frames is currently disabled; uncomment the lines below to
# write them as pickles.
# df_train.to_pickle("../input/df_train.pkl")
# df_test.to_pickle("../input/df_test.pkl")
# print("train/test saved to pickle!")

(636307, 7) (1185520, 7)
