In [56]:
import pandas as pd
import numpy as np
from numpy import nan
from numpy import isnan
from numpy import split
from numpy import array
from pandas import to_numeric


from sklearn.metrics import mean_squared_error
from matplotlib import pyplot
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import HuberRegressor
from sklearn.linear_model import Lars
from sklearn.linear_model import LassoLars
from sklearn.linear_model import PassiveAggressiveRegressor
from sklearn.linear_model import RANSACRegressor
from sklearn.linear_model import SGDRegressor



In [41]:
dataset = pd.read_csv('household_power_consumption.txt', sep=';', header=0, 
                      low_memory=False, infer_datetime_format=True, 
                      parse_dates={'datetime':[0,1]}, index_col=['datetime'])

In [73]:
print(dataset.shape)
dataset.head()


(2075259, 7)


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16 17:24:00,4.216,0.418,234.839996,18.4,0.0,1.0,17.0
2006-12-16 17:25:00,5.36,0.436,233.630005,23.0,0.0,1.0,16.0
2006-12-16 17:26:00,5.374,0.498,233.289993,23.0,0.0,2.0,17.0
2006-12-16 17:27:00,5.388,0.502,233.740005,23.0,0.0,1.0,17.0
2006-12-16 17:28:00,3.666,0.528,235.679993,15.8,0.0,1.0,17.0


In [74]:
dailyGroups = dataset.resample('D')
dailyData = dailyGroups.sum()
print(dailyData.shape)
dailyData.head()


(1442, 7)


Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16,1209.176025,34.922001,93552.53125,5180.799805,0.0,546.0,4926.0
2006-12-17,3390.459961,226.005997,345725.3125,14398.599609,2033.0,4187.0,13341.0
2006-12-18,2203.825928,161.792007,347373.625,9247.200195,1063.0,2621.0,14018.0
2006-12-19,1666.19397,150.942001,348479.0,7094.0,839.0,7602.0,6197.0
2006-12-20,2225.748047,160.998001,348923.625,9313.0,0.0,2648.0,14063.0


In [75]:
# this is based on per minute data
def fillMissingValues(values):
    prevDaySameTimeOffset = 60*24
    for row in range(values.shape[0]):
        for col in range(values.shape[1]):
            if isnan(values[row,col]):
                values[row,col] = values[row- prevDaySameTimeOffset, col]

def splitDataSet(data):
    train, test = data[1:-328], data[-328:-6]
    
    train = array(split(train,len(train)/7))
    test = array(split(test, len(test)/7))
    return train, test

def evaluateForcast(actual, predicted):
    scores = list()
    
    for i in range(actual.shape[1]):
        mse = mean_squared_error(actual[:,i], predicted[:,i])
        
        rmse = sqrt(mse)
        
        scores.append(rmse)
        
    # calculate overal RMSE
    s= 0 
    for row in range(actual.shape[0]):
        for col in shape(actual.shape[1]):
            s+= (actual[row,col] - predicted[row,col])**2
    
    score = sqrt(s/(actual.shape[0]* actual.shape[1]))
    return score, scores
        

#convert history into input and output
def toSupervised(hisrory, outputIdx):
    X,y = list(), list()
    
    for i in range(len(history)-1):
        X.append(history[i][:,0])
        y.append(history[i+1][outputIdx,0])
    return array(x), array(y)

def sklearnModelPredict(model, history):
    yHatSeq = list()
    
    #fit a model for each forecast day
    for i in range(7):
        #prepare data
        trainX, trainY = toSupervised(history,i)
        
        
        #make pipeline
        pipeline = makePipeLine(model)
        
        pipeline.fit(trainX, trainY)
        
        #forcast
        xInput = array(trainX[-1,:]).reshape(1,7)
        yHat = pipeline.predict(xInput)[0]
        
        # store
        yHatSeq.append(yHat)
    return yHatSeq
    
    
def makePipeline(model):
    steps = list()
    
    steps.append(('standardize', StandardScaler))
    steps.append(('normalize', MinMaxScaler))
    steps.append(('model', model))
    
    pipeline = Pipeline(steps= steps)
    return pipeline


def mlModels():
    models=dict()
    
    # linear models
    models['lr'] = LinearRegression()
    models['lasso'] = Lasso()
    models['ridge'] = Ridge()

    return models

In [None]:
#     models['en'] = ElasticNet()
#     models['huber'] = HuberRegressor()
#     models['lars'] = Lars()
#     models['llars'] = LassoLars()
#     models['pa'] = PassiveAggressiveRegressor(max_iter=1000, tol=1e-3)
#     models['ranscac'] = RANSACRegressor()
#     models['sgd'] = SGDRegressor(max_iter=1000, tol=1e-3)
#     print('Defined %d models' % len(models))

In [77]:
dailyData.to_csv('household_power_consumption_days.csv')
dailyDataSet = pd.read_csv('household_power_consumption_days.csv', header=0, infer_datetime_format= True, 
                           parse_dates=['datetime'], index_col=['datetime'] )
dailyDataSet.head()

Unnamed: 0_level_0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Sub_metering_1,Sub_metering_2,Sub_metering_3
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2006-12-16,1209.176,34.922,93552.53,5180.8,0.0,546.0,4926.0
2006-12-17,3390.46,226.006,345725.3,14398.6,2033.0,4187.0,13341.0
2006-12-18,2203.826,161.792,347373.62,9247.2,1063.0,2621.0,14018.0
2006-12-19,1666.194,150.942,348479.0,7094.0,839.0,7602.0,6197.0
2006-12-20,2225.748,160.998,348923.62,9313.0,0.0,2648.0,14063.0


In [80]:
train, test = splitDataSet(dailyDataSet.values)
print(train.shape)
print(test.shape)
train[:1]

(159, 7, 7)
(46, 7, 7)


array([[[3.3904600e+03, 2.2600600e+02, 3.4572530e+05, 1.4398600e+04,
         2.0330000e+03, 4.1870000e+03, 1.3341000e+04],
        [2.2038260e+03, 1.6179200e+02, 3.4737362e+05, 9.2472000e+03,
         1.0630000e+03, 2.6210000e+03, 1.4018000e+04],
        [1.6661940e+03, 1.5094200e+02, 3.4847900e+05, 7.0940000e+03,
         8.3900000e+02, 7.6020000e+03, 6.1970000e+03],
        [2.2257480e+03, 1.6099800e+02, 3.4892362e+05, 9.3130000e+03,
         0.0000000e+00, 2.6480000e+03, 1.4063000e+04],
        [1.7232880e+03, 1.4443400e+02, 3.4709640e+05, 7.2664000e+03,
         1.7650000e+03, 2.6920000e+03, 1.0456000e+04],
        [2.3413380e+03, 1.8690600e+02, 3.4730575e+05, 9.8970000e+03,
         3.1510000e+03, 3.5000000e+02, 1.1131000e+04],
        [4.7733860e+03, 2.2147000e+02, 3.4579594e+05, 2.0200400e+04,
         2.6690000e+03, 4.2500000e+02, 1.4726000e+04]]])

In [71]:
# mark all missing values
dataset.replace('?', nan, inplace=True)
# make dataset numeric
dataset = dataset.astype('float32')
fillMissingValues(dataset.values)

dataset.to_csv('household_power_consumption_modified.csv')


In [61]:
"""If True and parse_dates is enabled, pandas will attempt to infer the format of the datetime strings in the columns, 
and if it can be inferred, switch to a faster method of parsing them. In some cases this can increase the parsing speed"""

dataSet = pd.read_csv('household_power_consumption_days.csv', header=0, infer_datetime_format=True, 
                      parse_dates=['datetime'], index_col=['datetime'] )
trainData, testData = splitDataSet(dataSet.values)
models = mlModels()

for name, model in models.items():
    
    score, scores = evaluateModel(model, trainData, testData)
    
    summarizedScores(name, score, scores)
    
    
    

Defined 10 models


NameError: name 'evaluateModel' is not defined