# Obtain activity
The main objective of this task is to label each day's consumption according to its activity level. We want to model two types of days based on the consumption activity of each building:
- **Active** day. Usual consumption during working days.
- **Inactive** day. Usual consumption during holidays.

This will be achieved by running k-means (with 2 clusters) separately for each building.

After that, different types of consumers will be tagged, based on percentiles: high consumers, medium consumers and low consumers in the case of active consumptions; high consumers and low consumers, in the case of inactive consumptions.

In [1]:
import os

# Folder holding the pickled consumption frames. The value is concatenated
# directly with file names when the data is loaded, so it must end with a
# path separator.
# Set the CONS_PATH environment variable to run the notebook on another
# machine; when it is unset the original location is used unchanged.
CONS_PATH = os.environ.get('CONS_PATH', 'C:/Users/thmas/OneDrive - Universidad de Castilla-La Mancha/Informática/TFG/out/')
if not CONS_PATH.endswith(('/', '\\')):
    CONS_PATH += '/'

In [2]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [3]:
# Load the raw table: one row per building and day (index 'day'), where
# 'consumptions' holds that day's series of readings.
# read_pickle unpickles the file, so CONS_PATH must only point at trusted data.
raw = pd.read_pickle(CONS_PATH + 'raw_consumptions.zip')
raw

Unnamed: 0_level_0,building_id,weekday,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2012-02-24,89,4,"[nan, nan, nan, nan, 0.0, 25.9682072759303, 34..."
2012-02-25,89,5,"[8.0, 8.56965980289508, 7.83041664589254, 7.83..."
2012-02-26,89,6,"[9.0, 9.0, 8.47872481882854, 8.52127518117146,..."
2012-02-27,89,0,"[9.93594069444675, 9.0, 10.0, 18.4133936140153..."
2012-02-28,89,1,"[15.0, 15.0, 15.0, 23.0, 41.3474893206788, 39...."
...,...,...,...
2020-03-28,2233,5,"[8.96294314928535, 9.1999884489703, 9.22916758..."
2020-03-29,2233,6,"[9.05122649923577, 9.10856876843712, 9.0668798..."
2020-03-30,2233,0,"[9.14786320617928, 9.46424320377272, 12.979311..."
2020-03-31,2233,1,"[9.09777728991234, 9.49875136817228, 13.959012..."


In [4]:
# Distinct building (counter) identifiers, in order of first appearance in `raw`.
counters = raw['building_id'].unique()
counters

array([  89,   27,   49,   58,   69,   65,   63,   60,   37,   67,   82,
         91,  487,   81,  524,  317,  292,   28,  423,  451,  426,  430,
        436,  447,  455,  437,  293,  459,  469,  475,  652,  387, 1919,
        639,  642,  405,  651,  653,   86,   88,  501,  682,  506,  665,
        670,  694, 1926,  511,  687, 1925,  516,  520,  521,  522,  523,
        546,  556,  567,  568,  569,  560,  711,  703,  725,  746,  751,
        732,  742,  624,  625,  628,  629, 1866,  626, 1870,  630, 1939,
       1944, 2037, 2060, 2047, 2072, 2042, 2078, 2091, 2043, 2070, 2077,
       2076, 2207,  476, 2208, 2209, 2204, 2210,  737, 2233], dtype=int64)

In [5]:
counter_id = 487 # Counter ID example

# Take an independent copy of this building's rows: the following cells add
# columns to raw_df, and writing to a bare boolean-mask slice of `raw` is what
# triggers pandas' SettingWithCopyWarning.
raw_df = raw[raw['building_id'] == counter_id].copy()
raw_df

Unnamed: 0_level_0,building_id,weekday,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2013-12-17,487,1,"[nan, nan, nan, nan, nan, nan, nan, 4.49733527..."
2013-12-18,487,2,"[12.1029321298894, 12.1029321298894, 12.102932..."
2013-12-19,487,3,"[11.264909064798, 11.264909064798, 11.26490906..."
2013-12-20,487,4,"[10.9838823956164, 10.9838823956164, 10.983882..."
2013-12-21,487,5,"[6.93115242178077, 7.59915393780765, 7.5991539..."
...,...,...,...
2020-03-28,487,5,"[8.0, 9.0, 8.0, 8.0, 9.0, 9.0, 9.0, 9.0, 9.0, ..."
2020-03-29,487,6,"[9.0, 8.0, 8.0, 9.0, 8.0, 8.0, 9.0, 9.0, 8.089..."
2020-03-30,487,0,"[10.0, 10.0, 11.0, 12.8805883330563, 12.119411..."
2020-03-31,487,1,"[8.0, 7.0, 7.0, 10.0, 11.0, 14.0, 14.0, 11.154..."


In [6]:
# Daily total = sum of that day's readings, ignoring missing (NaN) samples.
# assign() returns a new frame instead of writing into a slice of `raw`, so the
# cell raises no SettingWithCopyWarning and can be re-run safely.
raw_df = raw_df.assign(total_cons=raw_df['consumptions'].apply(np.nansum))
raw_df['total_cons']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


day
2013-12-17    198.144249
2013-12-18    278.172099
2013-12-19    262.029504
2013-12-20    225.807653
2013-12-21    181.711693
                 ...    
2020-03-28    199.000000
2020-03-29    234.000000
2020-03-30    212.000000
2020-03-31    222.454595
2020-04-01    216.947150
Name: total_cons, Length: 2298, dtype: float64

Remove outliers

In [7]:
# Tukey fences: keep only days whose total lies strictly inside
# (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR).
daily_total = raw_df['total_cons']

q1, q3 = np.percentile(daily_total, [25, 75])
iqr = q3 - q1

minimum = q1 - 1.5 * iqr
maximum = q3 + 1.5 * iqr

within_fences = (daily_total > minimum) & (daily_total < maximum)
raw_df = raw_df[within_fences]

In [8]:
# One feature per day: the daily total, shaped as a column vector for scikit-learn.
X = raw_df['total_cons'].values.reshape(-1, 1)

scaler = StandardScaler()

X = scaler.fit_transform(X)

# Fixed seed so the cluster assignment is identical on every run.
# n_init is spelled out because its default differs between scikit-learn releases.
km = KMeans(n_clusters=2, n_init=10, random_state=0).fit(X)

In [9]:
# The cluster whose centre is lower is the inactive one. On a tie, label 1 is
# treated as inactive, which matches the original branch order.
centers = km.cluster_centers_.ravel()
inactive_label = 0 if centers[0] < centers[1] else 1

# Build the whole column at once instead of writing row by row through chained
# indexing (raw_df['active'].iloc[i] = ...), which warns and is a no-op under
# pandas copy-on-write. Dropping any previous 'active' column first keeps the
# cell re-runnable; .copy() detaches the frame from the slice it came from.
raw_df = raw_df.drop(columns='active', errors='ignore').copy()
raw_df.insert(2, 'active', km.labels_ != inactive_label)

raw_df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0_level_0,building_id,weekday,active,consumptions,total_cons
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2013-12-17,487,1,False,"[nan, nan, nan, nan, nan, nan, nan, 4.49733527...",198.144249
2013-12-18,487,2,True,"[12.1029321298894, 12.1029321298894, 12.102932...",278.172099
2013-12-19,487,3,True,"[11.264909064798, 11.264909064798, 11.26490906...",262.029504
2013-12-20,487,4,False,"[10.9838823956164, 10.9838823956164, 10.983882...",225.807653
2013-12-21,487,5,False,"[6.93115242178077, 7.59915393780765, 7.5991539...",181.711693
...,...,...,...,...,...
2020-03-28,487,5,False,"[8.0, 9.0, 8.0, 8.0, 9.0, 9.0, 9.0, 9.0, 9.0, ...",199.000000
2020-03-29,487,6,False,"[9.0, 8.0, 8.0, 9.0, 8.0, 8.0, 9.0, 9.0, 8.089...",234.000000
2020-03-30,487,0,False,"[10.0, 10.0, 11.0, 12.8805883330563, 12.119411...",212.000000
2020-03-31,487,1,False,"[8.0, 7.0, 7.0, 10.0, 11.0, 14.0, 14.0, 11.154...",222.454595


Defining consumption type. 0 means lowest consumer

In [10]:
# Rebind `raw` to the table that already carries 'active' and 'total_cons'
# for every building.
# NOTE(review): no cell visible here writes 'consumptions.zip' — presumably it
# comes from applying the per-building steps above to all counters; confirm.
raw = pd.read_pickle(CONS_PATH + 'consumptions.zip')
raw

Unnamed: 0_level_0,building_id,weekday,active,consumptions,total_cons
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-02-24,89,4,False,"[nan, nan, nan, nan, 0.0, 25.9682072759303, 34...",304.000000
2012-02-25,89,5,False,"[8.0, 8.56965980289508, 7.83041664589254, 7.83...",209.000000
2012-02-26,89,6,False,"[9.0, 9.0, 8.47872481882854, 8.52127518117146,...",212.064059
2012-02-27,89,0,False,"[9.93594069444675, 9.0, 10.0, 18.4133936140153...",488.935941
2012-02-28,89,1,False,"[15.0, 15.0, 15.0, 23.0, 41.3474893206788, 39....",554.000000
...,...,...,...,...,...
2020-03-28,2233,5,False,"[8.96294314928535, 9.1999884489703, 9.22916758...",211.029379
2020-03-29,2233,6,False,"[9.05122649923577, 9.10856876843712, 9.0668798...",220.640906
2020-03-30,2233,0,False,"[9.14786320617928, 9.46424320377272, 12.979311...",269.996718
2020-03-31,2233,1,False,"[9.09777728991234, 9.49875136817228, 13.959012...",268.458061


In [11]:
# Mean daily consumption of every building, split into active / inactive days.
# Rows are collected in a plain list and the frame is built once at the end:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every iteration.
records = []

for counter_id in raw['building_id'].unique():
    building = raw[raw['building_id'] == counter_id]

    actives = building['total_cons'][building['active']]
    inactives = building['total_cons'][building['active'] == False]

    # Same row order as before: the active row first, then the inactive one.
    # .mean() of an empty selection is NaN, so a building with no days of one
    # kind still gets a row.
    records.append({'building_id': counter_id, 'active': True, 'mean_cons': actives.mean()})
    records.append({'building_id': counter_id, 'active': False, 'mean_cons': inactives.mean()})

buildings_df = pd.DataFrame(records, columns=['building_id', 'active', 'mean_cons'])

buildings_df

Unnamed: 0,building_id,active,mean_cons
0,89,True,848.391144
1,89,False,491.823908
2,27,True,1166.341793
3,27,False,532.702991
4,49,True,784.583650
...,...,...,...
189,2210,False,284.642782
190,737,True,632.464069
191,737,False,346.436239
192,2233,True,484.342609


In [12]:
def get_consumption_type(df: pd.DataFrame, n: int) -> pd.DataFrame:
    """Split the rows of ``df`` into ``n`` percentile bands of ``mean_cons``.

    Band ``i`` holds the rows whose ``mean_cons`` lies in the half-open range
    [P(100/n * i), P(100/n * (i + 1))), where P is the percentile of the
    ``mean_cons`` column; the last band also includes the maximum. Band 0 is
    therefore the lowest-consuming group.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a numeric ``mean_cons`` column. It is not modified.
    n : int
        Number of bands.

    Returns
    -------
    pd.DataFrame
        The rows of ``df`` grouped band by band (original index labels kept),
        with an extra integer ``type`` column holding the band number. Rows
        whose ``mean_cons`` compares False against every bound (e.g. NaN) are
        absent from the result.
    """
    increment = 100 / n
    mean_cons = df['mean_cons']

    # Lower bound of every band, computed once instead of on each iteration.
    lower_bounds = np.percentile(mean_cons, [increment * i for i in range(n)])

    types = []
    for i in range(n):
        in_band = mean_cons >= lower_bounds[i]
        if i < n - 1:
            # Every band but the last is closed on the left and open on the right.
            in_band = in_band & (mean_cons < lower_bounds[i + 1])

        # assign() works on a copy, so nothing is written to a slice of the
        # caller's frame (the source of the SettingWithCopyWarning).
        types.append(df[in_band].assign(type=i))

    return pd.concat(types)

In [13]:
# Rank buildings separately for each kind of day: three bands for the
# active-day means and two bands for the inactive-day means (band 0 = lowest).
actives = get_consumption_type(buildings_df[buildings_df['active']], 3)
inactives = get_consumption_type(buildings_df[buildings_df['active'] == False], 2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


In [14]:
# Stack both label tables: one row per (building_id, active) pair with its 'type'.
types = pd.concat([actives, inactives])
types

Unnamed: 0,building_id,active,mean_cons,type
28,524,True,6.860086,0
40,426,True,47.545042,0
44,436,True,64.594253,0
46,447,True,118.621613,0
50,437,True,17.152971,0
...,...,...,...,...
185,2209,False,232.656448,1
187,2204,False,1596.073605,1
189,2210,False,284.642782,1
191,737,False,346.436239,1


In [15]:
# Bring in only the key columns and the label, so the merge adds nothing but 'type'.
type_labels = types[['building_id', 'active', 'type']]

# - drop(): removes a 'type' column left by a previous run, so the cell can be re-run.
# - validate='many_to_one': fails loudly if `types` ever holds duplicate
#   (building_id, active) keys, which would otherwise multiply rows.
# - set_index(): merge() discards the 'day' index, so it is restored from `raw`.
raw = (
    raw.drop(columns='type', errors='ignore')
       .merge(type_labels, on=['building_id', 'active'], how='left', validate='many_to_one')
       .set_index(raw.index)
)
raw = raw[['building_id', 'weekday', 'active', 'type', 'consumptions']]

raw

Unnamed: 0_level_0,building_id,weekday,active,type,consumptions
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2012-02-24,89,4,False,1,"[nan, nan, nan, nan, 0.0, 25.9682072759303, 34..."
2012-02-25,89,5,False,1,"[8.0, 8.56965980289508, 7.83041664589254, 7.83..."
2012-02-26,89,6,False,1,"[9.0, 9.0, 8.47872481882854, 8.52127518117146,..."
2012-02-27,89,0,False,1,"[9.93594069444675, 9.0, 10.0, 18.4133936140153..."
2012-02-28,89,1,False,1,"[15.0, 15.0, 15.0, 23.0, 41.3474893206788, 39...."
...,...,...,...,...,...
2020-03-28,2233,5,False,1,"[8.96294314928535, 9.1999884489703, 9.22916758..."
2020-03-29,2233,6,False,1,"[9.05122649923577, 9.10856876843712, 9.0668798..."
2020-03-30,2233,0,False,1,"[9.14786320617928, 9.46424320377272, 12.979311..."
2020-03-31,2233,1,False,1,"[9.09777728991234, 9.49875136817228, 13.959012..."
