# Clustering Individual Household Electric Power Consumption and Future Consumption Regression Analysis.

Our group proposes to use the Individual Household Electric Power Consumption data set to look for power consumption trends over time. We plan to cluster the data using descriptive methods to discover patterns and trends. By applying predictive methods such as regression, we plan to predict future power consumption.

Dataset: https://archive.ics.uci.edu/ml/machine-learning-databases/00235/household_power_consumption.zip

In [None]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import matplotlib.pyplot as plt
from datetime import datetime
from numpy.linalg import norm
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# Preprocessing

## Process and clean the data
Process the data by reading each line, removing the column header row, and splitting each line on the semicolon separators. Then convert the date and time stamps into a single numeric value so that the resulting dataset contains only numeric values.

In [None]:
def time_to_ratio(time_stamp):
    """Return how far through its calendar year a timestamp falls.

    Args:
        time_stamp: Date and time formatted as 'dd/mm/YYYY HH:MM:SS'.

    Returns:
        Seconds elapsed since 1 January 00:00:00 of the timestamp's year,
        divided by the total number of seconds in that year (so leap years
        use a longer denominator).

    Raises:
        ValueError: If ``time_stamp`` does not match the expected format.
    """
    moment = datetime.strptime(time_stamp, '%d/%m/%Y %H:%M:%S')
    year_begin = datetime(moment.year, 1, 1)
    next_year_begin = datetime(moment.year + 1, 1, 1)

    elapsed_seconds = (moment - year_begin).total_seconds()
    year_seconds = (next_year_begin - year_begin).total_seconds()
    return elapsed_seconds / year_seconds

def minutes_from_start(time_stamp, start_stamp):
    """Return the number of minutes separating two timestamps.

    Args:
        time_stamp: Timestamp of interest, formatted 'dd/mm/YYYY HH:MM:SS'.
        start_stamp: Reference timestamp in the same format.

    Returns:
        ``time_stamp - start_stamp`` expressed in minutes as a float;
        negative when ``time_stamp`` precedes ``start_stamp``.

    Raises:
        ValueError: If either argument does not match the expected format.
    """
    stamp_format = '%d/%m/%Y %H:%M:%S'
    moment, origin = (
        datetime.strptime(stamp, stamp_format)
        for stamp in (time_stamp, start_stamp)
    )
    return (moment - origin).total_seconds() / 60.0

In [None]:
# Load the raw text export, one list entry per line with the trailing newline removed.
with open('household_power_consumption.txt', 'r', encoding='utf-8') as f:
    lines = [raw_line.rstrip('\n') for raw_line in f]

# Discard every line containing '?', the marker for uncaptured data.
data_raw_reduced = [entry for entry in lines if '?' not in entry]

# Skip the first remaining line (the column headers) and split the rest on ';'.
data_raw = [entry.split(';') for entry in data_raw_reduced[1:]]

# Turn each record's date and time strings into a fraction of the calendar year.
time_ratios = [time_to_ratio(f'{row[0]} {row[1]}') for row in data_raw]

# Build all-numeric rows: the year fraction followed by the seven readings.
# The 9-name unpack drops the date/time strings and rejects rows of any other width.
data_time_raw = [
    [ratio, *map(float, (gap, grp, v, gi, s1, s2, s3))]
    for (_, _, gap, grp, v, gi, s1, s2, s3), ratio in zip(data_raw, time_ratios)
]


In [None]:
# Sanity check on the parsed Python list: row count, column count, first ten rows.
print("Number of rows: {}".format(len(data_time_raw)))
print("Number of columns: {}".format(len(data_time_raw[0])))
print(data_time_raw[:10])

# Switch to a float NumPy array for the numeric work that follows,
# then repeat the same three checks on the array.
data_time_np = np.array(data_time_raw, dtype=np.float64)
print("Number of rows: {}".format(len(data_time_np)))
print("Number of columns: {}".format(data_time_np.shape[-1]))
print(data_time_np[:10])


In [None]:
# Reference point: the date and time strings of the first retained record.
start_time = data_raw[0][0] + ' ' + data_raw[0][1]

# Minutes elapsed between each record and that reference point.
time_from_start = [
    minutes_from_start(row[0] + ' ' + row[1], start_time)
    for row in data_raw
]

## Normalization

In [None]:
## Additional Preprocessing Steps here ##
# Keep an unscaled copy of column 1 (global power) so its raw trend can still be checked.
global_power = data_time_np[:, 1].copy()

# Scale every column except column 0, in place, by the reciprocal of that
# column's own maximum value.
column_maxima = data_time_np[:, 1:].max(axis=0)
data_time_np[:, 1:] *= 1.0 / column_maxima


In [None]:
# Figure 1 shows the unscaled global power, figure 2 the scaled column 1,
# both against minutes since the first record.
for figure_number, series in enumerate((global_power, data_time_np[:, 1]), start=1):
    plt.figure(figure_number)
    plt.plot(time_from_start, series)

# Show the first ten rows of the scaled feature matrix.
print(data_time_np[:10])

## Dimensionality Reduction

In [None]:
# Fit a 4-component PCA on the feature matrix and keep the projected data.
pca = PCA(n_components=4)
data_time_np_reduced = pca.fit_transform(data_time_np)

# Show the shape of the projected array as a quick check of the reduction.
print(data_time_np_reduced.shape)

In [None]:
# Report the share of total variance retained by the fitted PCA. The component
# count is read from the fitted model rather than hard-coded, so the message
# stays correct if the n_components setting is changed.
print(f'Explained Variance from {pca.n_components_} components: {pca.explained_variance_ratio_.sum()*100:.2f}%')

In [None]:
# Eight stacked panels, one for each of the first eight columns of the
# feature matrix, each drawn against minutes since the first record.
fig, axs = plt.subplots(8, figsize=(15,30))
for i, panel in enumerate(axs):
    panel.plot(time_from_start, data_time_np[:, i])
    panel.set_title(f'feature {i}')

In [None]:
# Manual feature selection: drop columns 1-4 of the feature matrix and keep the rest.
# This rebinds data_time_np_reduced, replacing the PCA projection stored under that name.
data_time_np_reduced = np.delete(data_time_np, list(range(1, 5)), axis=1)

# Four stacked panels for the first four remaining columns. The titles number
# the columns of the reduced array, not the original column positions.
fig, axs = plt.subplots(4, figsize=(15,30))
for i, panel in enumerate(axs):
    panel.plot(time_from_start, data_time_np_reduced[:, i])
    panel.set_title(f'feature {i}')

# Cluster Analysis