## Clustering 
Similar to https://www.citizensutilityboard.org/wp-content/uploads/2019/06/ClusterAnalysisFinal.pdf, except we have household level demographics

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd

import sys
import os
from pathlib import Path
from glob import glob

from sklearn.cluster import KMeans
import plotly.express as px

# add path to modules to run from command line
BASE_PATH = os.path.join(os.environ['HOME'], 'github/blog')
sys.path.append(BASE_PATH)
from energy_usage_forecasting.utils import load_usage_data, REPO_DIR

In [9]:
%%time
DIR = Path(REPO_DIR, f'data/raw/smart-meters-in-london/halfhourly_dataset/halfhourly_dataset')
files = os.listdir(DIR)
dfs = []
for f in files: 
    print(f)
    df = pd.read_csv(Path(DIR, f))
    df['date_time'] = pd.to_datetime(df['tstp']).dt.tz_localize(tz="GMT")
    df = df.loc[df['date_time'].between('2012-12-01', '2013-01-31')]
    df['hour'] = df['date_time'].dt.hour
    df['energy'] = df.loc[df['energy(kWh/hh)'] != "Null"]['energy(kWh/hh)'].astype('float')
    df = df.groupby(['LCLid', 'hour'])['energy'].mean().reset_index()
    dfs += [df]
df1 = pd.concat(dfs)

block_71.csv
block_65.csv
block_59.csv
block_58.csv
block_64.csv
block_70.csv
block_99.csv
block_66.csv
block_72.csv
block_73.csv
block_67.csv
block_98.csv
block_88.csv
block_63.csv
block_77.csv
block_76.csv
block_62.csv
block_89.csv
block_48.csv
block_74.csv
block_60.csv
block_61.csv
block_75.csv
block_49.csv
block_108.csv


  interactivity=interactivity, compiler=compiler, result=result)


block_12.csv
block_4.csv
block_5.csv
block_13.csv
block_109.csv
block_11.csv
block_39.csv
block_7.csv
block_6.csv
block_38.csv
block_10.csv
block_28.csv
block_14.csv
block_2.csv
block_3.csv
block_15.csv
block_29.csv
block_17.csv
block_1.csv
block_0.csv
block_16.csv
block_101.csv
block_33.csv
block_27.csv
block_26.csv
block_32.csv
block_100.csv
block_102.csv
block_24.csv
block_30.csv
block_18.csv
block_19.csv
block_31.csv
block_25.csv
block_103.csv
block_107.csv
block_21.csv
block_35.csv
block_34.csv
block_20.csv
block_106.csv
block_110.csv
block_104.csv
block_36.csv
block_22.csv
block_8.csv
block_9.csv
block_23.csv
block_37.csv
block_105.csv
block_111.csv
block_93.csv
block_87.csv
block_50.csv
block_44.csv
block_78.csv
block_79.csv
block_45.csv
block_51.csv
block_86.csv
block_92.csv
block_84.csv
block_90.csv
block_47.csv
block_53.csv
block_52.csv
block_46.csv
block_91.csv
block_85.csv
block_81.csv
block_95.csv
block_42.csv
block_56.csv
block_57.csv
block_43.csv
block_94.csv
block_80.cs

In [10]:
len(df1.LCLid.unique())

5552

In [42]:
df_wide = df1.pivot(index='LCLid', columns='hour', values='energy') #.reset_index()
# Normalize to get patterns. 
for c in df_wide.columns:
    df_wide[c] = df_wide[c] / df_wide.max(axis=1)
df_wide.head()

hour,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
LCLid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
MAC000002,0.382725,0.372449,0.306173,0.2188,0.181331,0.171977,0.166044,0.17047,0.178086,0.289353,...,0.429403,0.38407,0.367304,0.435305,0.514899,0.560758,1.0,0.303803,0.303975,0.284902
MAC000003,0.935194,1.0,0.999992,1.0,1.0,1.0,1.0,0.41582,0.23709,0.224426,...,0.173303,0.148795,0.14223,0.154689,0.207672,0.21309,0.204557,0.174,0.16823,0.152967
MAC000004,0.620506,0.054794,0.072006,0.052707,0.060302,0.061505,0.058374,0.065098,0.086682,0.092085,...,0.05342,0.061346,0.061003,0.067898,0.061637,0.062746,0.072786,0.081372,0.05848,0.060408
MAC000005,0.135091,0.08725,0.085931,0.085399,0.084195,0.084219,0.094494,0.167558,0.615636,0.212043,...,0.221576,0.215997,0.260174,0.425803,0.575295,0.302287,0.234304,0.263129,0.288133,0.177359
MAC000006,0.206978,0.076273,0.075481,0.072986,0.071838,0.082649,0.08954,0.254878,0.288084,0.343736,...,0.084487,0.095074,0.111528,0.147726,0.23257,0.250907,0.235551,0.148847,0.103635,0.081601


In [43]:
km = KMeans(n_clusters=6, random_state=0)  # 6 used by CUB analysis

In [45]:
df_wide = df_wide.dropna()
km.fit(df_wide)
df_wide = df_wide.reset_index()
df_wide['cluster'] = km.labels_
df_wide.cluster.value_counts()

3    1347
1    1307
5     862
0     841
2     838
4     340
Name: cluster, dtype: int64

In [59]:
df_hh = pd.read_csv(Path(REPO_DIR, 'data/raw/smart-meters-in-london/informations_households.csv'))
data = pd.merge(df_wide[['LCLid', 'cluster']], df_hh)
pd.crosstab(data.Acorn_grouped, data.cluster)

cluster,0,1,2,3,4,5
Acorn_grouped,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ACORN-,0,0,1,0,0,1
ACORN-U,7,7,10,12,5,5
Adversity,344,424,181,507,53,297
Affluent,335,476,416,454,195,306
Comfortable,155,400,230,374,87,253


In [66]:
data = df_wide.melt(id_vars=['LCLid', 'cluster'], value_name='energy')
data = pd.merge(data, df_hh)

In [63]:
var = 'Acorn_grouped'
d = data.groupby([var, 'hour'])['energy'].mean().reset_index()
fig = px.line(d, x="hour", y="energy", color=var)
fig.show()

In [67]:
var = 'cluster'
d = data.groupby([var, 'hour'])['energy'].mean().reset_index()
fig = px.line(d, x="hour", y="energy", color=var)
fig.show()

In [49]:
data = pd.merge(df1, df_hh)
data = pd.merge(data, df_wide[['LCLid', 'cluster']])
data.head()

Unnamed: 0,LCLid,hour,energy,stdorToU,Acorn,Acorn_grouped,file,cluster
0,MAC000027,0,0.221333,Std,ACORN-J,Comfortable,block_71,0
1,MAC000027,1,0.181566,Std,ACORN-J,Comfortable,block_71,0
2,MAC000027,2,0.152639,Std,ACORN-J,Comfortable,block_71,0
3,MAC000027,3,0.135844,Std,ACORN-J,Comfortable,block_71,0
4,MAC000027,4,0.126713,Std,ACORN-J,Comfortable,block_71,0


In [50]:
var = 'Acorn_grouped'
d = data.groupby([var, 'hour'])['energy'].mean().reset_index()
fig = px.line(d, x="hour", y="energy", color=var)
fig.show()

In [51]:
var = 'cluster'
d = data.groupby([var, 'hour'])['energy'].mean().reset_index()
fig = px.line(d, x="hour", y="energy", color=var)
fig.show()