In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

## Load the merged dataset (path is relative to the notebook's working directory) and preview it.
## The following cell selects 'DATE', the two 'Total Power' columns and three 'Hourly...' columns from this frame.
df = pd.read_csv('merged_data.csv')
df.head()

In [None]:
## Columns of interest: the timestamp, the two power readings, and the three 'Hourly...' columns.
filtered_cols = [
    'DATE',
    'Total Power (max)',
    'Total Power (avg)',
    'HourlyDryBulbTemperature',
    'HourlyWetBulbTemperature',
    'HourlyRelativeHumidity'
]

## Take an explicit copy so `hourly` owns its data. Assigning into a bare
## `df[filtered_cols]` selection raises SettingWithCopyWarning and, depending on
## the pandas version, the write is not guaranteed to land on the frame we keep.
hourly = df[filtered_cols].copy()

## Normalise DATE to plain strings so it can be parsed with a fixed format later on.
hourly['DATE'] = hourly['DATE'].astype(str)

In [None]:
## Derive the calendar date of every hourly row. DATE must look like '%m/%d/%Y %H:%M';
## any other shape raises ValueError instead of being silently skipped.
dates = [datetime.strptime(stamp, '%m/%d/%Y %H:%M').date() for stamp in hourly['DATE']]

## Coerce every measurement column (everything after DATE) to float, one column at a time.
## The previous per-cell `.at[...].astype(float)` inside a bare `except` failed for plain
## Python scalars (object columns have no `.astype`) and therefore zeroed valid readings.
## Here parseable values are kept, missing values stay NaN, and only values that are
## present but not numeric fall back to 0 (the original fallback value).
numeric_cols = {}
for column in hourly.columns[1:]:
    parsed = pd.to_numeric(hourly[column], errors='coerce')
    unparseable = parsed.isna() & hourly[column].notna()
    numeric_cols[column] = parsed.mask(unparseable, 0).astype(float)

## `assign` returns a new frame, so this never writes into a view of `df`.
hourly = hourly.assign(**numeric_cols)

## Hourly measurements keyed by calendar date instead of by timestamp string.
daily = hourly.drop(columns = ['DATE']).assign(date = dates)

## Every measurement column, i.e. all columns except the trailing 'date'.
colList = daily.columns[:-1]

In [None]:
## Per-day summary statistics for every measurement column (pandas `describe()` per group),
## including the daily max of 'Total Power (max)', the variable of interest.
## The result has two-level columns, (measurement, statistic); resetDaily below reads the
## 'min', 'max' and 'mean' statistics from it.
groupby = daily.groupby('date').describe()
groupby.head()

In [None]:
def getWeekdays(df, datecol):
    """Add a 'weekday' column to `df` in place.

    Each entry is the result of calling `.weekday()` on the corresponding value
    of `df[datecol]`. For `datetime.date` values that is the day-of-week number
    (0 = Monday ... 6 = Sunday), not a weekday/weekend flag.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified, nothing is returned.
    datecol : str
        Name of the column holding objects that expose `.weekday()`.
    """
    # Assign a plain list so values are matched by position, not by index label.
    df['weekday'] = [day.weekday() for day in df[datecol]]

In [None]:
def runModel(df, model, predictorVar = 'Total Power (max)', 
             dropCols = None):
    """Fit `model` on an 80/20 split of `df` and report hold-out error metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per observation; must contain `predictorVar` and every name in `dropCols`.
    model : estimator
        Object exposing `fit(X, y)`, `predict(X)` and `score(X, y)`.
    predictorVar : str
        Name of the target (response) column.
    dropCols : list of str, optional
        Columns removed from `df` to form the feature matrix. Defaults to
        `['date', predictorVar]`, so the target is never left among the features
        when a non-default `predictorVar` is used. An explicit list is applied
        as given; the caller is then responsible for excluding the target.

    Returns
    -------
    tuple
        `(mae, rmse, rsq)` measured on the held-out 20 %. The three values are
        also printed.
    """
    if dropCols is None:
        dropCols = ['date', predictorVar]

    X = df.drop(columns = dropCols)
    Y = df[predictorVar]

    # NOTE: train_test_split shuffles rows by default, so the test rows are a random
    # 20 % of the days (not the most recent ones). The fixed seed keeps the split
    # identical from run to run.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=12345)

    model.fit(X_train, Y_train)

    # `score` is reported as R squared below (that is what scikit-learn regressors return).
    rsq = model.score(X_test, Y_test)

    # Predict once and reuse the result for both error metrics.
    predictions = model.predict(X_test)
    rmse = math.sqrt(metrics.mean_squared_error(Y_test, predictions))
    mae = metrics.mean_absolute_error(Y_test, predictions)

    print("R Squared Score: {:.4f}".format(rsq))
    print("Root Mean Squared Error: {:.2f}".format(rmse))
    print("Mean Absolute Error: {:.2f}".format(mae))

    return mae, rmse, rsq

In [None]:
def resetDaily(groupbyDF = None, collist = None, ndays = 0, trainDays = 5, 
               model = None):
    """Build the per-day feature table for one forecasting window and score a model on it.

    Parameters
    ----------
    groupbyDF : pandas.DataFrame, optional
        Per-day summary with two-level columns (measurement, statistic), indexed by
        date, as produced by `daily.groupby('date').describe()`. Defaults to the
        notebook-level `groupby`, looked up when the function is called.
    collist : iterable of str, optional
        Measurement names to read from `groupbyDF`. Defaults to the notebook-level
        `colList`, looked up when the function is called.
    ndays : int
        Offset, in rows, of the most recent power reading used as a feature
        (presumably the forecast horizon in days - confirm). With `ndays == 0`
        no lag features are created at all.
    trainDays : int
        Number of lagged power readings to include (the look-back window).
    model : estimator, optional
        Passed to `runModel`. A fresh `RandomForestRegressor()` is created per call
        when omitted, so no estimator instance is shared between calls.

    Returns
    -------
    tuple
        `(daily, mae, rmse, rsq)`: the feature table that was modelled, followed by
        the three metrics returned by `runModel`.

    Notes
    -----
    Lags are produced with `Series.shift`, i.e. they count rows. They equal calendar
    days only if `groupbyDF` has exactly one row per consecutive day.
    """
    # Backward compatibility: some callers pass the estimator as the first positional
    # argument (`resetDaily(model, ndays=..., trainDays=...)`). Treat an object that has
    # `fit` and is not a DataFrame as the model instead of failing on `.index` below.
    if groupbyDF is not None and not isinstance(groupbyDF, pd.DataFrame) and hasattr(groupbyDF, 'fit'):
        if model is None:
            model = groupbyDF
        groupbyDF = None

    # Resolve defaults at call time so re-running earlier cells is picked up here.
    if groupbyDF is None:
        groupbyDF = groupby
    if collist is None:
        collist = colList
    if model is None:
        model = RandomForestRegressor()

    parameters = ['min', 'max', 'mean']  # daily statistics kept for the non-power columns

    # Flatten the two-level summary into one column per (statistic, measurement).
    dailyData = {'date': list(groupbyDF.index)}
    for column in collist:
        if column == 'Total Power (max)':
            # Target: the highest 'Total Power (max)' reading of the day.
            dailyData['Total Power (max)'] = list(groupbyDF[column]['max'])
        elif column == 'Total Power (avg)':
            dailyData['Total Power (avg)'] = list(groupbyDF[column]['mean'])
        else:
            # Every other measurement contributes its daily min, max and mean.
            for param in parameters:
                dailyData[param + "_" + column] = list(groupbyDF[column][param])

    daily = pd.DataFrame(dailyData)

    # Columns that get lagged copies.
    weather_lag_cols = [
        'mean_HourlyDryBulbTemperature',
        'mean_HourlyWetBulbTemperature',
        'max_HourlyDryBulbTemperature',
        'max_HourlyWetBulbTemperature']
    power_lag_cols = ['Total Power (avg)', 'Total Power (max)']

    if ndays > 0:
        lagged = {}

        # Weather: lags 1 .. ndays + trainDays.
        for col in weather_lag_cols:
            for lag in range(1, ndays + trainDays + 1):
                lagged[col + "-" + str(lag) + "day"] = daily[col].shift(lag)

        # Power: lags ndays .. ndays + trainDays - 1, i.e. `trainDays` readings, the
        # most recent of which is `ndays` rows before the row being predicted.
        for col in power_lag_cols:
            for lag in range(ndays, trainDays + ndays):
                lagged[col + "-" + str(lag) + "day"] = daily[col].shift(lag)

        # Attach all lag columns in one step rather than inserting them one by one.
        daily = pd.concat([daily, pd.DataFrame(lagged)], axis = 1)

    # Rows without a full lag history contain NaN and are dropped. The same-day average
    # power is dropped as well so it cannot be used as a predictor.
    daily = daily.dropna().drop(columns = ['Total Power (avg)'])

    # Adds the 'weekday' column to `daily` in place.
    getWeekdays(daily, 'date')

    mae, rmse, rsq = runModel(daily, model)

    return daily, mae, rmse, rsq

In [None]:
def pltErrors(trainingDays,
             errorDict = None, ymax = 3000):
    """Plot MAE and RMSE against the number of look-ahead days for one training window.

    Parameters
    ----------
    trainingDays : int
        Zero-based index of the training window; the title shows `trainingDays + 1`.
    errorDict : dict, optional
        Must provide the keys 'ndays', 'MAE' and 'RMSE', each mapping to equally long
        sequences. When omitted, an empty chart is drawn instead of failing.
    ymax : number
        Upper limit of the y axis (the lower limit is fixed at 0).
    """
    # Build the default per call; a dict literal in the signature would be shared
    # between calls.
    if errorDict is None:
        errorDict = {'ndays':[],'MAE':[], 'RMSE':[], 'RSQ':[]}

    horizons = errorDict['ndays']

    plt.plot(horizons, errorDict['MAE'])
    plt.plot(horizons, errorDict['RMSE'])

    plt.legend(['MAE', 'RMSE'])
    plt.xlabel('n_days')
    plt.ylabel('Model Error')
    plt.title('Train Days = ' + str(trainingDays+1))
    plt.ylim(ymin=0, ymax=ymax)

    # One tick per whole day. Skipped when there is nothing to plot, because
    # min()/max() raise ValueError on an empty sequence.
    if len(horizons) > 0:
        xint = range(min(horizons), math.ceil(max(horizons))+1)
        plt.xticks(xint)

    plt.show()

In [None]:
## Sanity check: build the feature table for ndays=5, trainDays=10 and list its columns.
## resetDaily always calls runModel, so this also fits a model and prints its metrics.
resetDaily(ndays = 5, trainDays = 10)[0].columns

In [None]:
def testErrors(model = RandomForestRegressor(), 
               trainDaysRange = 10, n_days= 5):
    for j in range(trainDaysRange):
        errors = {'ndays':[],'MAE':[], 'RMSE':[], 'RSQ':[]}
        for i in range(n_days):
            print("{} days:".format(i+1))
            daily, mae, rmse, rsq = resetDaily(model, 
                                               ndays = i+1, trainDays = j+1)
            print()

            errors['MAE'].append(mae)
            errors['RMSE'].append(rmse)
            errors['RSQ'].append(rsq)
            errors['ndays'].append(i+1)

        pltErrors(j, errors)

In [None]:
## Error sweep with a random forest, using testErrors' defaults:
## training windows of 1-10 days and horizons of 1-5 days (one figure per window).
testErrors(model = RandomForestRegressor())

In [None]:
## Same sweep, this time passing a gradient-boosting regressor as `model`.
testErrors(model = GradientBoostingRegressor())

In [None]:
## Questions:
## Can we improve by creating dummy variables for time of day?
## Then incrementing an hour-of-day input parameter by one for each prediction?