# Construct Training data

After data investigation, we would like to construct training data with things we have learned from data investigation. Let's pull up all information and data we need.

In [179]:
#specify data source, zipcode (along with latitude, longitue, and timezone), and bin values to calculate mean.
datasource = 'L'
#[zipcode, latitude, longitude, localtz] = ['08640', 40.0039, -74.6178, 'US/Eastern']
#[zipcode, latitude, longitude, localtz] = ['08641', 40.0449, -74.5892, 'US/Eastern']
#this vin values serves only NJ 
#binvalue = [0.60000000000000009, 1.5531428571428569, 2.5209606741573034, 4.1015736906211933, 7.487476098503139, 12.943637931034482]
#[zipcode, latitude, longitude, localtz] = ['92562', 33.5686, -117.2530, 'US/Pacific']
[zipcode, latitude, longitude, localtz] = ['92563', 33.5712, -117.1540, 'US/Pacific']
#this bin values serves only CA
binvalue = [0.6366666666666666, 1.5599424405242481, 2.4851360650682972, 3.9649782737139554, 6.7641328892898853, 12.809405655777667]

In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import pytz
import os
import patsy #for spline regression
import scipy #for non-negative least square 
import scipy as sp 
from scipy import stats
from scipy.optimize import nnls
from numpy.linalg import inv #for matrix and statistics
import scipy as sp
import math
import statsmodels.api as sm
import random
%matplotlib inline

In [181]:
#mainDir = 'C:/Users/Tee/Dropbox/Active/EnergyProject/Thesis/PVreadingsStudies/main'
mainDir = 'C:\Users\Admin\Dropbox\Active\EnergyProject\Thesis'
#read data and metadata
metadata = pd.read_csv(mainDir+ '/data/solar/' + datasource +'/' + zipcode +'/metadata.csv',
                             dtype={'componentId':'object','zip':'object'})
data = pd.read_csv(mainDir+ '/data/solar/' + datasource +'/'+ zipcode+'/data.csv',
                             dtype={'componentId':'object'})
#convert timestamp string to datetime format if needed
#metadata['FirstTimestamp'] = metadata['FirstTimestamp'].map(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))
#data['tsLocal'] = data['tsLocal'].map(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))

In [182]:
#size assignment according to buckets
metadata['size'] = float('NaN')
metadata.loc[metadata.sizeBucket == '0-1 kW','size'] = binvalue[0]
metadata.loc[metadata.sizeBucket == '1-2 kW','size'] = binvalue[1]
metadata.loc[metadata.sizeBucket == '2-3 kW','size'] = binvalue[2]
metadata.loc[metadata.sizeBucket == '3-5 kW','size'] = binvalue[3]
metadata.loc[metadata.sizeBucket == '5-10 kW','size'] = binvalue[4]
metadata.loc[metadata.sizeBucket == '10-20 kW','size'] = binvalue[5]

In [183]:
#select only residential components
metadata = metadata[metadata['size'] <= 20]
data = data[['tsLocal','altitude','azimuth']+list(metadata['componentId'])]

The way to process data is similar to function data crunch. However, we now impose condition on number of components. At each timestamp, if the number of components that provide power readings are less than 50 components, we report 'NaN' as a value at the timestamp. Otherwise, we use the sum of power readings over total size of components that give the sum as a value at the timestamp.

In [184]:
dat = data
dat = dat.set_index(['tsLocal','altitude','azimuth'])
presencecheck = dat.notnull()
dat['numactivecomp'] = presencecheck.sum(axis=1)
dat['totalpower'] = dat.sum(axis=1)
for i in presencecheck.columns.values:
    presencecheck[i] = presencecheck[i]*metadata[metadata.componentId == i]['size'].iloc[0]
dat['totalsize'] = presencecheck.sum(axis=1)
dat['power'] = dat['totalpower']/dat['totalsize']
dat['power'][dat.numactivecomp<50] = float('NaN')
dat.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Let's save both processed power readings data and relavent metadata.

In [185]:
directory = mainDir + '/data/solar/' + datasource +'/' + zipcode
metadata.to_csv(directory+"/training_metadata.csv",index=False)
dat[['tsLocal','altitude','azimuth','power','numactivecomp','totalsize']].to_csv(directory+"/training_data.csv",index=False)

## Training data for maximum profile validation

Here we use NJ data to validate maximum profile by running two set of training data.

In [186]:
#specify data source, zipcode (along with latitude, longitue, and timezone), and bin values to calculate mean.
#datasource = 'LocusEnergy'

#[zipcode, latitude, longitude, localtz] = ['08641', 40.0449, -74.5892, 'US/Eastern']
#this vin values serves only NJ 
#binvalue = [0.60000000000000009, 1.5531428571428569, 2.5209606741573034, 4.1015736906211933, 7.487476098503139, 12.943637931034482]

In [187]:
#mainDir = 'C:/Users/Tee/Dropbox/Active/EnergyProject/Thesis/PVreadingsStudies/main'
mainDir = 'C:\Users\Admin\Dropbox\Active\EnergyProject\Thesis'
#read data and metadata
metadata = pd.read_csv(mainDir+ '/data/solar/' + datasource +'/' + zipcode +'/metadata.csv',
                             dtype={'componentId':'object','zip':'object'})
data = pd.read_csv(mainDir+ '/data/solar/' + datasource +'/'+ zipcode+'/data.csv',
                             dtype={'componentId':'object'})
#convert timestamp string to datetime format if needed
#metadata['FirstTimestamp'] = metadata['FirstTimestamp'].map(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))
#data['tsLocal'] = data['tsLocal'].map(lambda x: datetime.datetime.strptime(x,"%Y-%m-%d %H:%M:%S"))

In [188]:
#size assignment according to buckets
metadata['size'] = float('NaN')
metadata.loc[metadata.sizeBucket == '0-1 kW','size'] = binvalue[0]
metadata.loc[metadata.sizeBucket == '1-2 kW','size'] = binvalue[1]
metadata.loc[metadata.sizeBucket == '2-3 kW','size'] = binvalue[2]
metadata.loc[metadata.sizeBucket == '3-5 kW','size'] = binvalue[3]
metadata.loc[metadata.sizeBucket == '5-10 kW','size'] = binvalue[4]
metadata.loc[metadata.sizeBucket == '10-20 kW','size'] = binvalue[5]

Split metadata file by half to construct training data 1 and 2

In [189]:
metadata1 = metadata.loc[::2,:]
metadata2 = metadata.loc[1::2,:]

In [190]:
metadata = metadata1

In [191]:
data = pd.read_csv(mainDir+ '/data/solar/' + datasource +'/'+ zipcode+'/data.csv',
                             dtype={'componentId':'object'})

In [192]:
#select only residential components
metadata = metadata[metadata['size'] <= 20]
data = data[['tsLocal','altitude','azimuth']+list(metadata['componentId'])]
dat = data
dat = dat.set_index(['tsLocal','altitude','azimuth'])
presencecheck = dat.notnull()
dat['numactivecomp'] = presencecheck.sum(axis=1)
dat['totalpower'] = dat.sum(axis=1)
for i in presencecheck.columns.values:
    presencecheck[i] = presencecheck[i]*metadata[metadata.componentId == i]['size'].iloc[0]
dat['totalsize'] = presencecheck.sum(axis=1)
dat['power'] = dat['totalpower']/dat['totalsize']
dat['power'][dat.numactivecomp<50] = float('NaN')
dat.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [193]:
directory = mainDir + '/data/solar/' + datasource +'/' + zipcode
metadata.to_csv(directory+"/training_metadata_1.csv",index=False)
dat[['tsLocal','altitude','azimuth','power','numactivecomp','totalsize']].to_csv(directory+"/training_data_1.csv",index=False)

In [194]:
metadata = metadata2

In [195]:
data = pd.read_csv(mainDir+ '/data/solar/' + datasource +'/'+ zipcode+'/data.csv',
                             dtype={'componentId':'object'})

In [196]:
#select only residential components
metadata = metadata[metadata['size'] <= 20]
data = data[['tsLocal','altitude','azimuth']+list(metadata['componentId'])]
dat = data
dat = dat.set_index(['tsLocal','altitude','azimuth'])
presencecheck = dat.notnull()
dat['numactivecomp'] = presencecheck.sum(axis=1)
dat['totalpower'] = dat.sum(axis=1)
for i in presencecheck.columns.values:
    presencecheck[i] = presencecheck[i]*metadata[metadata.componentId == i]['size'].iloc[0]
dat['totalsize'] = presencecheck.sum(axis=1)
dat['power'] = dat['totalpower']/dat['totalsize']
dat['power'][dat.numactivecomp<50] = float('NaN')
dat.reset_index(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [197]:
directory = mainDir + '/data/solar/' + datasource +'/' + zipcode
metadata.to_csv(directory+"/training_metadata_2.csv",index=False)
dat[['tsLocal','altitude','azimuth','power','numactivecomp','totalsize']].to_csv(directory+"/training_data_2.csv",index=False)

In [198]:
directory

'C:\\Users\\Admin\\Dropbox\\Active\\EnergyProject\\Thesis\\PVreadingsStudies\\main/data/LocusEnergy/92563'