# Obtain activity
The main objective of this task is to label each day consumption depending on its activity. We want to model two types of days based on the consumption activity of each building:
- **Active** day. Usual consumption during working days.
- **Inactive** day. Usual consuomption during holidays.

This will be achieved by performing k-means (for 2 clusters), every building by its own.

After that, different types of consumers will be tagged, based on percentiles: high consumers, medium consumers and low consumers in the case of active consumptions; high consumers and low consumers, in the case of inactive consumptions.

In [1]:
CONS_PATH = '../data/'

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [3]:
raw = pd.read_csv(CONS_PATH + 'raw_consumptions.csv', index_col='day', converters={'consumptions': lambda x: list(map(float, x.strip('[]').split()))}, na_values='nan', parse_dates=True, infer_datetime_format=True)
raw

Unnamed: 0_level_0,building_id,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1
2011-07-26,27,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2011-07-27,27,"[17.0, 19.0, 18.35079465, 35.84631282, 47.8462..."
2011-07-28,27,"[18.88870418, 18.80300889, 18.84589226, 35.845..."
2011-07-29,27,"[20.0, 21.0, 20.0, 37.78877899, 45.84570469, 5..."
2011-07-30,27,"[17.29811321, 17.0, 17.23969745, 17.84583302, ..."
...,...,...
2020-05-10,2233,"[8.69106633, 8.54560612, 8.13895298, 8.8050705..."
2020-05-11,2233,"[8.57166388, 10.30621946, 10.4596744, 12.97239..."
2020-05-12,2233,"[8.31569322, 9.21411531, 10.19029213, 12.96810..."
2020-05-13,2233,"[8.71920264, 9.09413051, 10.02307814, 11.23125..."


Obtain weekday

In [4]:
days = raw.index.drop_duplicates().tolist()

raw.insert(1, 'weekday', -1)
for day in days:
    raw.loc[day, 'weekday'] = day.weekday()
    
raw

Unnamed: 0_level_0,building_id,weekday,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-07-26,27,1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2011-07-27,27,2,"[17.0, 19.0, 18.35079465, 35.84631282, 47.8462..."
2011-07-28,27,3,"[18.88870418, 18.80300889, 18.84589226, 35.845..."
2011-07-29,27,4,"[20.0, 21.0, 20.0, 37.78877899, 45.84570469, 5..."
2011-07-30,27,5,"[17.29811321, 17.0, 17.23969745, 17.84583302, ..."
...,...,...,...
2020-05-10,2233,6,"[8.69106633, 8.54560612, 8.13895298, 8.8050705..."
2020-05-11,2233,0,"[8.57166388, 10.30621946, 10.4596744, 12.97239..."
2020-05-12,2233,1,"[8.31569322, 9.21411531, 10.19029213, 12.96810..."
2020-05-13,2233,2,"[8.71920264, 9.09413051, 10.02307814, 11.23125..."


In [5]:
def dropNan(df: pd.DataFrame) -> pd.DataFrame:
    nan_rows = []
    
    for i in df.index:
        if True in np.isnan(df['consumptions'].loc[i]):
            nan_rows.append(i)
            
    return df.drop(index=nan_rows)

In [6]:
raw.reset_index(inplace=True)
raw = dropNan(raw)
raw.set_index('day', inplace=True)
raw

Unnamed: 0_level_0,building_id,weekday,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2011-07-27,27,2,"[17.0, 19.0, 18.35079465, 35.84631282, 47.8462..."
2011-07-28,27,3,"[18.88870418, 18.80300889, 18.84589226, 35.845..."
2011-07-29,27,4,"[20.0, 21.0, 20.0, 37.78877899, 45.84570469, 5..."
2011-07-30,27,5,"[17.29811321, 17.0, 17.23969745, 17.84583302, ..."
2011-07-31,27,6,"[18.84559887, 17.15385255, 18.0, 18.0, 18.0, 2..."
...,...,...,...
2020-05-09,2233,5,"[9.50106059, 8.96931442, 8.60189324, 9.1773389..."
2020-05-11,2233,0,"[8.57166388, 10.30621946, 10.4596744, 12.97239..."
2020-05-12,2233,1,"[8.31569322, 9.21411531, 10.19029213, 12.96810..."
2020-05-13,2233,2,"[8.71920264, 9.09413051, 10.02307814, 11.23125..."


In [7]:
counter_id = 487 # Counter ID example

raw_df = raw[raw['building_id'] == counter_id]
raw_df

Unnamed: 0_level_0,building_id,weekday,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-12-18,487,2,"[12.10293213, 12.10293213, 12.10293213, 12.102..."
2013-12-20,487,4,"[10.9838824, 10.9838824, 10.9838824, 10.983882..."
2013-12-21,487,5,"[6.93115242, 7.59915394, 7.59915394, 7.5991539..."
2013-12-22,487,6,"[7.59915394, 7.59915394, 7.59915394, 7.5991539..."
2013-12-23,487,0,"[7.59915394, 7.59915394, 7.59915394, 7.5991539..."
...,...,...,...
2020-05-25,487,0,"[11.0, 9.0, 9.01754998, 12.98245002, 12.0, 12...."
2020-05-26,487,1,"[10.0, 9.0, 9.0, 12.7908828, 12.2091172, 13.0,..."
2020-05-27,487,2,"[9.59337189, 9.0, 9.0, 11.0, 12.56089413, 12.4..."
2020-05-28,487,3,"[9.17706204, 9.82293796, 9.0, 11.0, 12.0, 12.3..."


In [8]:
aux_df = pd.DataFrame(columns=['h' + str(i) for i in range(24)])
aux_df[['h' + str(i) for i in range(24)]] = raw_df.apply(lambda x: x[-1], axis=1, result_type='expand')
aux_df

Unnamed: 0_level_0,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,...,h14,h15,h16,h17,h18,h19,h20,h21,h22,h23
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-12-18,12.102932,12.102932,12.102932,12.102932,12.102932,12.102932,12.102932,12.102932,12.102932,11.536983,...,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909
2013-12-20,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,...,9.000000,8.000000,6.000000,8.000000,6.506118,6.493882,7.000000,7.000000,7.000000,7.000000
2013-12-21,6.931152,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,...,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154
2013-12-22,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,...,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154
2013-12-23,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,...,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05-25,11.000000,9.000000,9.017550,12.982450,12.000000,12.000000,14.000000,12.000000,11.000000,11.000000,...,9.000000,11.000000,8.000000,9.000000,10.000000,8.000000,9.000000,10.000000,9.000000,9.000000
2020-05-26,10.000000,9.000000,9.000000,12.790883,12.209117,13.000000,14.000000,13.000000,12.000000,11.000000,...,10.000000,10.000000,9.000000,9.000000,9.000000,9.000000,9.000000,9.745043,9.254957,8.406628
2020-05-27,9.593372,9.000000,9.000000,11.000000,12.560894,12.439106,13.000000,12.000000,11.000000,10.000000,...,10.000000,10.000000,9.000000,9.000000,10.000000,9.000000,9.000000,10.000000,9.000000,10.000000
2020-05-28,9.177062,9.822938,9.000000,11.000000,12.000000,12.330907,12.669093,11.000000,11.823270,10.176730,...,11.000000,10.000000,10.000000,10.000000,10.000000,9.000000,10.000000,10.000000,9.000000,9.000000


Remove outliers

In [9]:
def removeOutliers(df: pd.DataFrame) -> pd.DataFrame:
    for col in df.columns:
        if df.empty:
            return df
        
        q3 = np.percentile(df[col], 75)
        q1 = np.percentile(df[col], 25)

        iqr = q3 - q1

        maximum = np.percentile(df[col], 75) + 1.5 * iqr
        minimum = np.percentile(df[col], 25) - 1.5 * iqr

        df = df[(df[col] > minimum) & (df[col] < maximum)]
    
    return df

In [10]:
aux_df = removeOutliers(aux_df)
aux_df

Unnamed: 0_level_0,h0,h1,h2,h3,h4,h5,h6,h7,h8,h9,...,h14,h15,h16,h17,h18,h19,h20,h21,h22,h23
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2013-12-18,12.102932,12.102932,12.102932,12.102932,12.102932,12.102932,12.102932,12.102932,12.102932,11.536983,...,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909,11.264909
2013-12-20,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,10.983882,...,9.000000,8.000000,6.000000,8.000000,6.506118,6.493882,7.000000,7.000000,7.000000,7.000000
2013-12-21,6.931152,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,...,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154
2013-12-22,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,...,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154
2013-12-23,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,...,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154,7.599154
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020-05-25,11.000000,9.000000,9.017550,12.982450,12.000000,12.000000,14.000000,12.000000,11.000000,11.000000,...,9.000000,11.000000,8.000000,9.000000,10.000000,8.000000,9.000000,10.000000,9.000000,9.000000
2020-05-26,10.000000,9.000000,9.000000,12.790883,12.209117,13.000000,14.000000,13.000000,12.000000,11.000000,...,10.000000,10.000000,9.000000,9.000000,9.000000,9.000000,9.000000,9.745043,9.254957,8.406628
2020-05-27,9.593372,9.000000,9.000000,11.000000,12.560894,12.439106,13.000000,12.000000,11.000000,10.000000,...,10.000000,10.000000,9.000000,9.000000,10.000000,9.000000,9.000000,10.000000,9.000000,10.000000
2020-05-28,9.177062,9.822938,9.000000,11.000000,12.000000,12.330907,12.669093,11.000000,11.823270,10.176730,...,11.000000,10.000000,10.000000,10.000000,10.000000,9.000000,10.000000,10.000000,9.000000,9.000000


In [11]:
raw_df = raw_df[raw_df.index.isin(aux_df.index)]
raw_df

Unnamed: 0_level_0,building_id,weekday,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-12-18,487,2,"[12.10293213, 12.10293213, 12.10293213, 12.102..."
2013-12-20,487,4,"[10.9838824, 10.9838824, 10.9838824, 10.983882..."
2013-12-21,487,5,"[6.93115242, 7.59915394, 7.59915394, 7.5991539..."
2013-12-22,487,6,"[7.59915394, 7.59915394, 7.59915394, 7.5991539..."
2013-12-23,487,0,"[7.59915394, 7.59915394, 7.59915394, 7.5991539..."
...,...,...,...
2020-05-25,487,0,"[11.0, 9.0, 9.01754998, 12.98245002, 12.0, 12...."
2020-05-26,487,1,"[10.0, 9.0, 9.0, 12.7908828, 12.2091172, 13.0,..."
2020-05-27,487,2,"[9.59337189, 9.0, 9.0, 11.0, 12.56089413, 12.4..."
2020-05-28,487,3,"[9.17706204, 9.82293796, 9.0, 11.0, 12.0, 12.3..."


In [12]:
raw_df['total_cons'] = raw_df['consumptions'].apply(np.nansum)
raw_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,building_id,weekday,consumptions,total_cons
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013-12-18,487,2,"[12.10293213, 12.10293213, 12.10293213, 12.102...",278.172099
2013-12-20,487,4,"[10.9838824, 10.9838824, 10.9838824, 10.983882...",225.807653
2013-12-21,487,5,"[6.93115242, 7.59915394, 7.59915394, 7.5991539...",181.711693
2013-12-22,487,6,"[7.59915394, 7.59915394, 7.59915394, 7.5991539...",182.379695
2013-12-23,487,0,"[7.59915394, 7.59915394, 7.59915394, 7.5991539...",182.379695
...,...,...,...,...
2020-05-25,487,0,"[11.0, 9.0, 9.01754998, 12.98245002, 12.0, 12....",245.000000
2020-05-26,487,1,"[10.0, 9.0, 9.0, 12.7908828, 12.2091172, 13.0,...",249.406628
2020-05-27,487,2,"[9.59337189, 9.0, 9.0, 11.0, 12.56089413, 12.4...",245.593372
2020-05-28,487,3,"[9.17706204, 9.82293796, 9.0, 11.0, 12.0, 12.3...",249.000000


In [13]:
X = raw_df['total_cons'].values.reshape(-1, 1)

scaler = StandardScaler()

X = scaler.fit_transform(X)

km = KMeans(n_clusters=2).fit(X)

In [14]:
raw_df.insert(2, 'active', True)
for i in range(raw_df.shape[0]):
    if km.cluster_centers_[0] < km.cluster_centers_[1]:
        if km.labels_[i] == 0:
            raw_df['active'].iloc[i] = False
    else:
        if km.labels_[i] == 1:
            raw_df['active'].iloc[i] = False
            
raw_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0_level_0,building_id,weekday,active,consumptions,total_cons
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-12-18,487,2,True,"[12.10293213, 12.10293213, 12.10293213, 12.102...",278.172099
2013-12-20,487,4,False,"[10.9838824, 10.9838824, 10.9838824, 10.983882...",225.807653
2013-12-21,487,5,False,"[6.93115242, 7.59915394, 7.59915394, 7.5991539...",181.711693
2013-12-22,487,6,False,"[7.59915394, 7.59915394, 7.59915394, 7.5991539...",182.379695
2013-12-23,487,0,False,"[7.59915394, 7.59915394, 7.59915394, 7.5991539...",182.379695
...,...,...,...,...,...
2020-05-25,487,0,True,"[11.0, 9.0, 9.01754998, 12.98245002, 12.0, 12....",245.000000
2020-05-26,487,1,True,"[10.0, 9.0, 9.0, 12.7908828, 12.2091172, 13.0,...",249.406628
2020-05-27,487,2,True,"[9.59337189, 9.0, 9.0, 11.0, 12.56089413, 12.4...",245.593372
2020-05-28,487,3,True,"[9.17706204, 9.82293796, 9.0, 11.0, 12.0, 12.3...",249.000000


Defining consumption type. 0 means lowest consumer

In [15]:
raw = pd.read_pickle(CONS_PATH + 'clean_consumptions.zip')
raw

Unnamed: 0_level_0,building_id,weekday,active,consumptions,total_cons
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-07-26,27,1,False,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ...",368.000000
2011-07-27,27,2,True,"[17.0, 19.0, 18.35079465, 35.84631282, 47.8462...",1376.111296
2011-07-28,27,3,True,"[18.88870418, 18.80300889, 18.84589226, 35.845...",1432.888704
2011-07-29,27,4,True,"[20.0, 21.0, 20.0, 37.78877899, 45.84570469, 5...",1374.701887
2011-07-30,27,5,False,"[17.29811321, 17.0, 17.23969745, 17.84583302, ...",451.298662
...,...,...,...,...,...
2020-05-10,2233,6,False,"[8.69106633, 8.54560612, 8.13895298, 8.8050705...",201.552366
2020-05-11,2233,0,False,"[8.57166388, 10.30621946, 10.4596744, 12.97239...",266.428660
2020-05-12,2233,1,False,"[8.31569322, 9.21411531, 10.19029213, 12.96810...",253.456566
2020-05-13,2233,2,False,"[8.71920264, 9.09413051, 10.02307814, 11.23125...",266.154979


In [16]:
buildings_df = pd.DataFrame()

for counter_id in raw['building_id'].unique():
    building = raw[raw['building_id'] == counter_id]
    
    actives = building['total_cons'][building['active']]
    inactives = building['total_cons'][building['active'] == False]
    
    buildings_df = buildings_df.append(pd.DataFrame({'building_id': counter_id, 'active': [True, False], 'mean_cons': [actives.mean(), inactives.mean()]}), ignore_index=True)
    
buildings_df

Unnamed: 0,building_id,active,mean_cons
0,27,True,1164.853429
1,27,False,522.576203
2,28,True,1133.045919
3,28,False,534.186961
4,37,True,1902.723098
...,...,...,...
189,2209,False,223.679921
190,2210,True,2002.971497
191,2210,False,284.642782
192,2233,True,481.260526


In [17]:
def get_consumption_type(df: pd.DataFrame, n: int):
    increment = 100 / n

    types = []
    for i in range(n):
        if i == n - 1:
            atype = df[df['mean_cons'] >= np.percentile(df['mean_cons'], increment * i)]
        else:
            atype = df[(df['mean_cons'] >= np.percentile(df['mean_cons'], increment * i)) & (df['mean_cons'] < np.percentile(df['mean_cons'], increment * (i + 1)))]

        atype['type'] = i

        types.append(atype)
        
    return pd.concat(types)

In [18]:
actives = get_consumption_type(buildings_df[buildings_df['active']], 3)
inactives = get_consumption_type(buildings_df[buildings_df['active'] == False], 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [19]:
types = pd.concat([actives, inactives])
types

Unnamed: 0,building_id,active,mean_cons,type
24,86,True,156.795554,0
26,88,True,134.334077,0
44,426,True,51.643651,0
48,436,True,64.594253,0
50,437,True,17.152971,0
...,...,...,...,...
185,2207,False,2156.218538,1
187,2208,False,344.124138,1
189,2209,False,223.679921,1
191,2210,False,284.642782,1


In [20]:
raw = raw.merge(types, on=['building_id', 'active'], how='left').set_index(raw.index)
raw = raw[['building_id', 'weekday', 'active', 'type', 'consumptions']]

raw

Unnamed: 0_level_0,building_id,weekday,active,type,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2011-07-26,27,1,False,1,"[nan, nan, nan, nan, nan, nan, nan, nan, nan, ..."
2011-07-27,27,2,True,2,"[17.0, 19.0, 18.35079465, 35.84631282, 47.8462..."
2011-07-28,27,3,True,2,"[18.88870418, 18.80300889, 18.84589226, 35.845..."
2011-07-29,27,4,True,2,"[20.0, 21.0, 20.0, 37.78877899, 45.84570469, 5..."
2011-07-30,27,5,False,1,"[17.29811321, 17.0, 17.23969745, 17.84583302, ..."
...,...,...,...,...,...
2020-05-10,2233,6,False,1,"[8.69106633, 8.54560612, 8.13895298, 8.8050705..."
2020-05-11,2233,0,False,1,"[8.57166388, 10.30621946, 10.4596744, 12.97239..."
2020-05-12,2233,1,False,1,"[8.31569322, 9.21411531, 10.19029213, 12.96810..."
2020-05-13,2233,2,False,1,"[8.71920264, 9.09413051, 10.02307814, 11.23125..."
