# Imports
## import packages

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import datetime as dt
import os
from matplotlib import pyplot as plt
from sklearn import linear_model

## import scripts

In [3]:
# Path of the raw CSV, relative to this notebook.
file = '../../data/tacoma.csv'

# Reshape wide -> long: 'date', 'high', 'low' and 'tdelta' stay as identifier
# columns; every other CSV column becomes one row, with its former column
# name in 'time' and its cell value in 'load'.
data = pd.read_csv(file).melt(id_vars=['date', 'high', 'low', 'tdelta'])
data.columns = ['date', 'high', 'low', 'tdelta', 'time', 'load']

# Join the date text and the former column name (presumably a time of day --
# confirm against the CSV header) into one timestamp. Strings that cannot be
# parsed become NaT instead of raising.
data['date'] = pd.to_datetime(
    data['date'].astype(str) + ' ' + data['time'].astype(str),
    errors='coerce',
)

# From here on the frame is ordered by, and indexed on, the timestamp.
data = data.drop(columns='time').sort_values(by='date').set_index('date')

# Calendar features derived from the timestamp index.
data['hour'] = data.index.hour
# Element [1] of isocalendar() is the ISO week number.
data['iso_week'] = [day.isocalendar()[1] for day in data.index.date]
# 0 = Monday ... 6 = Sunday
data['dayofweek'] = data.index.dayofweek

# Discard every row that still has a missing value in any column, then
# average the two temperature columns row by row.
data.dropna(inplace=True)
data['temp_avg'] = data[['high', 'low']].mean(axis=1)

## Add daytype, dayofyear, logload

In [5]:
# Classify every row by where its weekday sits relative to the weekend
# (dayofweek: 0 = Monday ... 6 = Sunday):
#    0 -> non-workday (Saturday or Sunday)
#    1 -> day before a non-workday (Friday)
#    2 -> day after a non-workday (Monday)
#   -1 -> every other day (Tuesday to Thursday)
conditions = [
    # non workday
    data['dayofweek'] == 5,
    data['dayofweek'] == 6,

    # day before non workday
    data['dayofweek'] == 4,

    # day after non workday
    data['dayofweek'] == 0,
]

choices = [0, 0, 1, 2]

# The default has to be an integer like the choices. A string default ('-1')
# either silently turns the whole column into strings or, on recent NumPy
# releases, raises because int and str have no common dtype.
# NOTE(review): day_type is now an integer column; check any later code that
# compares it against string values such as '0' or '-1'.
data['day_type'] = np.select(conditions, choices, default=-1)

# Day of the year taken from the timestamp index, and the natural log of load.
data['dayofyear'] = data.index.dayofyear
data['logload'] = np.log(data['load'])

In [6]:
# Write the prepared frame to 'pydata.csv' in the current working directory.
# `index` is a boolean switch (write the index or not); the header text of the
# index column is controlled by `index_label`.
data.to_csv('pydata.csv', index=True, index_label='date', header=True)

In [7]:
# Split into train and test sets by calendar year, using partial-string
# indexing on the datetime index: 2013 through 2017 (both inclusive) for
# training, 2018 for testing.
# Row selection goes through .loc: plain data['2018'] is treated as a column
# lookup by current pandas and raises KeyError.
# Original author's note: only iso week 53 is 2016
train = data.loc['2013':'2017']
test = data.loc['2018']

# Single-column frames holding just the target variable.
train_load = train['load'].to_frame()
test_load = test['load'].to_frame()