In [1]:
import pandas as pd
import numpy as np
from datetime import datetime
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score, roc_auc_score
from sklearn import metrics
from sklearn.model_selection import train_test_split

df = pd.read_csv('merged_data.csv')
df.head()

Unnamed: 0,DATE,Total Power,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyPrecipitation,HourlyRelativeHumidity,HourlySkyConditions,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,...,FG,FZ,GS,HZ,MI,PL,RA,SN,TS,VCTS
0,3/1/19 0:00,0.0,9,17,0,70,OVC,29.64,10,15,...,0,0,0,0,0,0,0,0,0,0
1,3/1/19 1:00,0.0,9,17,0,70,OVC,29.63,10,15,...,0,0,0,0,0,0,0,0,0,0
2,3/1/19 2:00,0.0,9,17,0,70,OVC,29.62,10,15,...,0,0,0,0,0,0,0,0,0,0
3,3/1/19 3:00,0.0,10,19,0,68,OVC,29.63,10,17,...,0,0,0,0,0,0,0,0,0,0
4,3/1/19 4:00,0.0,8,16,0,71,SCT,29.63,10,14,...,0,0,0,0,0,0,0,0,0,0


In [2]:
for i, row in df.iterrows():
    for column in df.columns[1:]:
        try:
            df.at[i, column] = df.at[i, column].astype(float)
        except:
            df.at[i, column] = 0

In [3]:
filtered_cols = [
    'DATE',
    'Total Power',
    'HourlyDryBulbTemperature',
    'HourlyWetBulbTemperature',
    'HourlyRelativeHumidity'
]
hourly = df[filtered_cols]

In [4]:
dates = []
for i, row in hourly.iterrows():
    dates.append(datetime.strptime(row['DATE'], '%m/%d/%y %H:%M').date())

daily = hourly.drop(columns = ['DATE']).copy()
daily['date'] = dates
colList = daily.copy().columns[:-1]

groupby = daily.groupby('date').describe()
groupby.head()

Unnamed: 0_level_0,Total Power,Total Power,Total Power,Total Power,Total Power,Total Power,Total Power,Total Power,HourlyDryBulbTemperature,HourlyDryBulbTemperature,...,HourlyWetBulbTemperature,HourlyWetBulbTemperature,HourlyRelativeHumidity,HourlyRelativeHumidity,HourlyRelativeHumidity,HourlyRelativeHumidity,HourlyRelativeHumidity,HourlyRelativeHumidity,HourlyRelativeHumidity,HourlyRelativeHumidity
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2019-03-01,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,25.416667,...,28.0,31.0,24.0,66.625,10.503881,44.0,60.25,70.0,75.0,81.0
2019-03-02,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,30.583333,...,31.0,32.0,24.0,79.041667,6.772574,64.0,75.0,82.0,85.0,85.0
2019-03-03,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,27.75,...,26.0,27.0,24.0,70.666667,11.675566,51.0,59.5,75.0,78.75,85.0
2019-03-04,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,18.791667,...,18.25,23.0,24.0,64.291667,11.547554,50.0,54.75,61.5,71.75,84.0
2019-03-05,24.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,24.0,13.916667,...,14.0,15.0,24.0,61.916667,9.249755,47.0,53.5,63.5,70.0,74.0


In [5]:
def getWeekdays(df, datecol):
    weekdays = []
    for i, row in df.iterrows():
        weekdays.append(row[datecol].weekday())

    df['weekday'] = weekdays

In [6]:
def runModel(df, predictorVar = 'max_power', 
             dropCols = ['mean_power', 'date', 'max_power']):
    X = df.drop(columns = dropCols)
    Y = df[predictorVar]

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=12345)

    print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

    rf = RandomForestRegressor()
    rf.fit(X_train, Y_train)

    print("R Squared Score: {:.4f}".format(rf.score(X_test, Y_test)))
    print("Root Mean Squared Error: {:.2f}".format(math.sqrt(metrics.mean_squared_error(Y_test, rf.predict(X_test)))))
    print("Mean Absolute Error: {:.2f}".format(metrics.mean_absolute_error(Y_test, rf.predict(X_test))))

    #importance = pd.DataFrame({'col_name': rf.feature_importances_}, index=X.columns.sort_values(by='col_name', ascending=False)
    #importance['col_name'] = 100*importance['col_name']
    #print(importance.head())

In [7]:
def resetDaily(groupbyDF = groupby, collist = colList, ndays = 0):
    parameters = ['min', 'max', 'mean']
    dailyCols = ['date']
    dailyData = {'date':list(groupby.index)}

    for column in collist:
        if(column == 'Total Power'):
            dailyData['max_power'] = list(groupby[column]['max'])
            dailyData['mean_power'] = list(groupby[column]['mean'])
        else:
            for param in parameters:
                dailyData[param + "_" + column] = list(groupby[column][param])

    daily = pd.DataFrame(dailyData)
    
    nday_cols = [
        'mean_HourlyDryBulbTemperature',
        'mean_HourlyWetBulbTemperature',
        'max_HourlyDryBulbTemperature',
        'max_HourlyWetBulbTemperature',
        'mean_power', 'max_power'
    ]
    
    if(ndays > 0):
        for col in nday_cols:
            for i in range(ndays):
                colname = col + "-" + str(i + 1) + "day"
                daily[colname] = daily[col].shift(i+1)
            
        daily = daily.drop(columns = ['mean_power'])
        
    daily = daily.dropna()
    
    getWeekdays(daily, 'date')
    
    if(ndays > 0):
        runModel(daily, dropCols = ['date', 'max_power'])
    else:
        runModel(daily)

    return daily

In [8]:
for i in range(5):
    print("{} days:".format(i))
    resetDaily(ndays = i)
    print("\n\n")

0 days:
(292, 10) (74, 10) (292,) (74,)
R Squared Score: 0.9473
Root Mean Squared Error: 1423.62
Mean Absolute Error: 1102.46



1 days:
(292, 16) (73, 16) (292,) (73,)
R Squared Score: 0.9653
Root Mean Squared Error: 1134.64
Mean Absolute Error: 822.24



2 days:
(291, 22) (73, 22) (291,) (73,)
R Squared Score: 0.9755
Root Mean Squared Error: 936.32
Mean Absolute Error: 642.83



3 days:
(290, 28) (73, 28) (290,) (73,)
R Squared Score: 0.9566
Root Mean Squared Error: 1272.52
Mean Absolute Error: 803.07



4 days:
(289, 34) (73, 34) (289,) (73,)
R Squared Score: 0.9621
Root Mean Squared Error: 1186.66
Mean Absolute Error: 833.36



