In [150]:
import pandas as pd
import numpy as np
from datetime import datetime
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

df = pd.read_csv('merged_data.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,DATE,Total Power (max),Total Power (min),Total Power (avg),Total Power (samp),Total Power (trimmed),HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,...,FG,FZ,GS,HZ,MI,PL,RA,SN,TS,VCTS
0,0,2019-03-01 00:00:00,0.0,0.0,0.0,0.0,0.0,9.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,2019-03-01 01:00:00,0.0,0.0,0.0,0.0,0.0,9.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,2019-03-01 02:00:00,0.0,0.0,0.0,0.0,0.0,9.0,17.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3,2019-03-01 03:00:00,0.0,0.0,0.0,0.0,0.0,10.0,19.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,2019-03-01 04:00:00,0.0,0.0,0.0,0.0,0.0,8.0,16.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [151]:
## Selecting columns of interest to us.
filtered_cols = [
    'DATE',
    'Total Power (trimmed)',
    'HourlyDryBulbTemperature',
    'HourlyWetBulbTemperature'
]

## Making a new dataframe only with these columns of interest.
hourly = df[filtered_cols]
#hourly['DATE'] = hourly['DATE'].astype(str)

In [152]:
hourly["DATE"] = hourly.apply(lambda x: pd.Timestamp(x["DATE"]), axis=1)

colList = hourly.copy().columns[:-1] 

groupby = hourly

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [153]:
## Add h-1 to h-n and h+1 to h+24, current time, DoW, and month
# max power, avg power?, dry bulb temp, humidity
# Row: h0 power, h0 weather, h- power, h- weather, h+ weather, (h+ power) together
# features should be built on information from the last several hours

def windowData(df, npast=0, nfuture=0, colpast=[], colfuture=[]):
    # init return variable
    outdf = df.copy()
    
    # add past windows
    kept_past = df[colpast]  # elements used past windows
    for i in range(1,npast+1):
        temp = kept_past.shift(i)
        temp.columns = [f"{c}-{i}hr" for c in kept_past.columns]
        outdf = pd.concat([outdf,temp],axis=1)
        
    # add future windows
    kept_future = df[colfuture]  # elements used future windows
    for i in range(1,nfuture+1):
        temp = kept_future.shift(-i)
        temp.columns = [f"{c}+{i}hr" for c in kept_future.columns]
        outdf = pd.concat([outdf,temp],axis=1)
    
    return outdf



In [154]:
past_features = ['Total Power (trimmed)', 'HourlyDryBulbTemperature', 'HourlyWetBulbTemperature']
future_features = ['HourlyDryBulbTemperature', 'HourlyWetBulbTemperature']

hourlyWindowed = windowData(hourly, npast=12, nfuture=24, colpast=past_features, colfuture=future_features)

#hourlyWindowed = hourlyWindowed.dropna()

In [155]:
## Adding TS features (later)

#Adding day of week, day of year, time of day measures
## ** I think these are categorical and should be represented as dummy variables **
## implemented with cyclicality represented by sin and cos
months = hourly.apply(lambda x: x["DATE"].month, axis=1)
hourlyWindowed['month_sin'] = np.sin((months-1)*(2.*np.pi/12))
hourlyWindowed['month_cos'] = np.cos((months-1)*(2.*np.pi/12))

dow = hourly.apply(lambda x: x["DATE"].dayofweek, axis=1)
hourlyWindowed['DoW_sin'] = np.sin(dow*(2.*np.pi/7))
hourlyWindowed['DoW_cos'] = np.cos(dow*(2.*np.pi/7))
    
hr = hourly.apply(lambda x: x["DATE"].hour, axis=1)
hourlyWindowed['Hour_sin'] = np.sin(hr*(2.*np.pi/24))
hourlyWindowed['Hour_cos'] = np.cos(hr*(2.*np.pi/24))


In [156]:
## build train and test (2 sets, one time-seperated, one random)
# Cutoff date between train and test data (CV train set)
# also worth trying random sampling

#response_col = 'Total Power (trimmed)'

#def split(df, perc_split=0.8, response=response_col, shuffled=False):
#    # Shuffled determines if split should be random (True) or sequential (False)
#    
#    X = df.drop(columns = ['DATE', f"{response}"])  # predictors are all features except response and date
#    Y = df[response_col]  # response is as specified in argument
#    
#    return train_test_split(X, Y, train_size = perc_split, random_state=12345, shuffle=shuffled)

In [169]:
def genXY(X, target_var, maxoffset=1):
    '''
    Generate X and Y data to be used in run model. X should be untrimmed (include NaNs).
    
    Will return tuple contianing trimmed X and list of Y vectors with offset ranging from 1 to maxoffset.
    '''
    df = X.copy()
    Y = []
    
    for l in range(1,maxoffset+1):
        df[f"offset_{l}"] = df[target_var].shift(-l)
    
    df = df.dropna()
    df = df.reset_index(drop=True)
    df = df.drop(columns=['DATE'])
    
    for l in range(1,maxoffset+1):
        Y.append(df[f"offset_{l}"])
        df.drop(columns=[f"offset_{l}"], inplace=True)
    
    return (df, Y)

In [170]:
X, Ys = genXY(hourlyWindowed, 'Total Power (trimmed)', maxoffset=3)

In [171]:
## Make and run model
def runModel(X, Y, model, verbose=False):

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.8, random_state=12345, shuffle=False)  # create train and test sets

    model.fit(X_train, Y_train)  # fit model
    
    # Calculate error metrics
    rsq = model.score(X_test, Y_test)
    rmse = math.sqrt(metrics.mean_squared_error(Y_test, model.predict(X_test)))
    mae = metrics.mean_absolute_error(Y_test, model.predict(X_test))
    
    # Display if selected in arguments
    if verbose:
        print("R Squared Score: {:.4f}".format(rsq))
        print("Root Mean Squared Error: {:.2f}".format(rmse))
        print("Mean Absolute Error: {:.2f}".format(mae))

    #importance = pd.DataFrame({'col_name': rf.feature_importances_}, index=X.columns).sort_values(by='col_name', ascending=False)
    #importance['col_name'] = 100*importance['col_name']
    #print(importance[:10])
    
    return model, mae, rmse, rsq

In [172]:
runModel(X=X, Y=Ys[0], model= RandomForestRegressor(), verbose=True)

R Squared Score: 0.9665
Root Mean Squared Error: 354.90
Mean Absolute Error: 197.38


(RandomForestRegressor(),
 197.3845032682265,
 354.9027453158792,
 0.9665024850309276)

In [175]:
def pltErrors(errorDict,model=""):
    plt.plot(errorDict['ndays'], errorDict['MAE'])
    plt.plot(errorDict['ndays'], errorDict['RMSE'])
    
    plt.legend(['MAE', 'RMSE'])
    plt.xlabel('Hours in the future')
    plt.ylabel('Model Error')
    plt.title(f'Future Prediction Error {model}')
    plt.ylim(ymin=0, ymax =3000)
    xint = range(min(errorDict['ndays']), math.ceil(max(errorDict['ndays']))+1)
    plt.xticks(xint)
    
    plt.show()

In [176]:
def testErrors(X, Ys, model):
    errors = {'ndays':list(range(1,len(Ys)+1)),'MAE':[], 'RMSE':[], 'RSQ':[]}
    for Y in Ys:
        _, mae, rmse, rsq = runModel(X, Y, model)
        errors['MAE'].append(mae)
        errors['RMSE'].append(rmse)
        errors['RSQ'].append(rsq)
        
        
    pltErrors(errors, model)

In [178]:
testErrors(X=X, Ys=Ys[0], model= RandomForestRegressor())

TypeError: Singleton array array(0.) cannot be considered a valid collection.

In [None]:
## Questions:
## Can we improve by creating dummy variables for time of day?
## Then incrimenting an hour of day input parameter by one for each prediction?