# Clustering Individual Household Electric Power Consumption and Future Consumption Regression Analysis.

Our group proposes to use the Individual Household Electric Power Consumption data set to look for power consumption trends over time. We plan to cluster the data using descriptive methods to discover patterns and trends. By applying predictive methods such as regression, we plan to predict future power consumption.

Dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip

In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt
import math
from datetime import datetime
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from scipy.spatial.distance import euclidean
from scipy.spatial.distance import cityblock
from sklearn.cluster import KMeans, DBSCAN, OPTICS
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

# Preprocessing

## Process and clean the data
Process the data by reading each line, removing the column header information, and stripping the semicolon separators. Then convert the date and time stamps to numeric values and merge the two to obtain a data set with all numeric values.

In [None]:
def time_to_ratio(time_stamp):
    """Return how far through its calendar year a timestamp lies.

    Args:
        time_stamp: Text in the form ``'DD/MM/YYYY HH:MM:SS'``.

    Returns:
        A float: seconds elapsed since 1 January 00:00:00 of the stamp's
        year, divided by the total number of seconds in that year.

    Raises:
        ValueError: If ``time_stamp`` does not match the expected format.
    """
    moment = datetime.strptime(time_stamp, '%d/%m/%Y %H:%M:%S')
    year_begin = datetime(moment.year, 1, 1)
    year_finish = datetime(moment.year + 1, 1, 1)
    # Measuring the year itself keeps leap years (366 days) correct.
    elapsed_seconds = (moment - year_begin).total_seconds()
    year_seconds = (year_finish - year_begin).total_seconds()
    return elapsed_seconds / year_seconds

def minutes_from_start(time_stamp, start_stamp):
    """Return the number of minutes between two timestamps.

    Args:
        time_stamp: Text in the form ``'DD/MM/YYYY HH:MM:SS'``.
        start_stamp: Reference timestamp in the same format.

    Returns:
        A float: ``time_stamp - start_stamp`` expressed in minutes; negative
        when ``time_stamp`` precedes ``start_stamp``.

    Raises:
        ValueError: If either argument does not match the expected format.
    """
    stamp_format = '%d/%m/%Y %H:%M:%S'
    moment, origin = (
        datetime.strptime(text, stamp_format)
        for text in (time_stamp, start_stamp)
    )
    elapsed = moment - origin
    return elapsed.total_seconds() / 60.0

def get_cluster_number(labels):
    """Count the distinct labels, leaving out the label ``-1``.

    Args:
        labels: Collection of hashable cluster labels.

    Returns:
        The number of distinct values in ``labels``, reduced by one when
        ``-1`` is among them.
    """
    distinct_total = len(set(labels))
    if -1 in labels:
        return distinct_total - 1
    return distinct_total

In [None]:
# Load the raw text file, keeping each line with its trailing newline removed.
with open('household_power_consumption.txt', 'r', encoding='utf-8') as f:
    lines = [raw_line.rstrip('\n') for raw_line in f]

# Discard every line that contains a '?' (marks uncaptured measurements).
data_raw_reduced = [entry for entry in lines if '?' not in entry]

# Skip the first surviving line (the column header) and split the rest on ';'.
data_raw = [entry.split(';') for entry in data_raw_reduced[1:]]

# Keep fields 6, 7 and 8 of every record as floats.
# NOTE(review): presumably the three sub-metering columns - confirm against
# the header line of the data file.
X_list = [[float(fields[col]) for col in (6, 7, 8)] for fields in data_raw]

## Normalization

## Dimensionality Reduction

In [None]:
# Scale every feature column by its own maximum so the largest value in each
# column becomes 1.0.
X = np.array(X_list)
peaks = X.max(axis=0)
# A column whose maximum is 0 would otherwise be multiplied by 1/0 = inf,
# turning its zeros into NaN; scaling such a column by 1 leaves it unchanged.
peaks[peaks == 0.0] = 1.0
# Multiply by the reciprocal (rather than divide) to keep the results
# bit-identical to the per-column loop this replaces.
X *= 1.0 / peaks

In [None]:
# Timestamp of the first retained record; the x-axis is measured from it.
start_time = ' '.join((data_raw[0][0], data_raw[0][1]))
# Plot every `step`-th record whose index lies in the window [N, M).
N = 1000
M = 5000
step = 10
time_from_start = [
    minutes_from_start(f'{record[0]} {record[1]}', start_time)
    for record in data_raw[N:M:step]
]
plt.figure(figsize=(15, 10))
# Column 1 of the scaled features against elapsed minutes.
plt.plot(time_from_start, X[N:M:step, 1])

In [None]:
# Thin the data set by keeping every fifth row (as an independent copy).
X_red = X[::5].copy()

# Drop the rows in which every feature is exactly zero.
X_red = X_red[~(X_red == 0.0).all(axis=1)]
print(len(X_red))
# Reproducible 75/25 split of the remaining rows.
X_train, X_test = train_test_split(X_red, random_state=42, train_size=0.75)
print(len(X_train))

In [None]:
# Disabled experiment: compute the Manhattan (cityblock) distance from every
# row of X to the first row, then print how many rows lie closer than 0.02.
# NOTE(review): presumably used to pick a neighbourhood radius for the
# density-based clustering below - confirm before deleting.
# dis = []
# for row in X:
#     dis.append(cityblock(row, X[0]))
# dis = np.array(dis)
# dis = dis[dis < 0.02]
# print(dis.size)

In [None]:
# 3-D scatter of the training rows, one axis per feature column.
fig = plt.figure(figsize=(15, 15))
ax = plt.axes(projection='3d')
# X_train has three columns, so its transpose unpacks into xs, ys, zs.
ax.scatter3D(*X_train.T)

plt.show()

# Cluster Analysis

In [None]:
# Earlier DBSCAN attempt on the full reduced set, kept for reference:
# dbscan = DBSCAN(eps=0.025, min_samples=200, metric='manhattan', algorithm='ball_tree', leaf_size=10000)
# dbscan.fit(X_red)
# labels = dbscan.labels_

# Fit OPTICS on the training rows and take one label per row.
optics = OPTICS(
    min_samples=200,
    max_eps=0.5,
    metric='manhattan',
    min_cluster_size=0.01,
    n_jobs=-1,
    cluster_method='dbscan',
    eps=0.05,
).fit(X_train)
labels = optics.labels_

# Cluster count (the label -1 is not counted), then the distinct labels.
print(get_cluster_number(labels))
print(set(labels))

In [None]:
# 3-D scatter of the training rows, drawn as one series per distinct label.
plt.figure(figsize=(15, 15))
ax = plt.axes(projection='3d')

for l in set(labels):
    # Rows of X_train that were assigned the current label.
    X_bylabel = X_train[labels == l]
    ax.scatter3D(*X_bylabel.T)

plt.show()

In [None]:
# Report how many training rows carry each distinct label.
for l in set(labels):
    print(f'Number of points in cluster {l}: {len(labels[labels == l])}')

In [None]:
# Persist the labels under a file name built from the fitted estimator's own
# settings, so the name cannot drift out of sync with the OPTICS call. With
# the current settings this yields the same name as before:
# 'minsamp200maxeps0.5minclust0.01eps0.05.npy'.
np.save(
    f'minsamp{optics.min_samples}maxeps{optics.max_eps}'
    f'minclust{optics.min_cluster_size}eps{optics.eps}.npy',
    labels,
)