In [249]:
import numpy as np
import pandas as pd


In [250]:
def TrainTestSplit(data, test_size = 0.15, scale = False, cols_to_transform=None, include_test_scale=False):
    
    df = data.copy()
    # get the index after which test set starts
    test_index = int(len(df)*(1-test_size))
    
    # StandardScaler fit on the entire dataset
    if scale and include_test_scale:
        scaler = StandardScaler()
        df[cols_to_transform] = scaler.fit_transform(df[cols_to_transform])
        
    X_train = df.drop('demand', axis = 1).iloc[:test_index]
    y_train = df.demand.iloc[:test_index]
    X_test = df.drop('demand', axis = 1).iloc[test_index:]
    y_test = df.demand.iloc[test_index:]
    
    # StandardScaler fit only on the training set
    if scale and not include_test_scale:
        scaler = StandardScaler()
        X_train[cols_to_transform] = scaler.fit_transform(X_train[cols_to_transform])
        X_test[cols_to_transform] = scaler.transform(X_test[cols_to_transform])
    
    return X_train, X_test, y_train, y_test

In [251]:
# Reading the gefcom dataset from rda file
df = pd.read_csv('./Data/cleandata/CleanedCT.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0,ts,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
0,2004-01-01 00:00:00,CT,3126.000,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
1,2004-01-01 01:00:00,CT,2945.000,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2,2004-01-01 02:00:00,CT,2804.000,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
3,2004-01-01 03:00:00,CT,2729.000,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
4,2004-01-01 04:00:00,CT,2722.000,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113971,2016-12-31 19:00:00,CT,3744.918,40.0,29.0,2016-12-31,2016,12,19,Sat,366,True,False,121315.0,non-working
113972,2016-12-31 20:00:00,CT,3558.586,41.0,30.0,2016-12-31,2016,12,20,Sat,366,True,False,121316.0,non-working
113973,2016-12-31 21:00:00,CT,3378.466,38.0,32.0,2016-12-31,2016,12,21,Sat,366,True,False,121317.0,non-working
113974,2016-12-31 22:00:00,CT,3195.386,37.0,32.0,2016-12-31,2016,12,22,Sat,366,True,False,121318.0,non-working


In [252]:
df['day_of_week'] = df['day_of_week'].astype('category')
df['non_working'] = df['non_working'].astype('category')
df['month'] = df['month'].astype('category')
df['ts'] = pd.to_datetime(df['ts'])
CT =df.set_index('ts')
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [253]:
def season_calc(month):
    """adding season based on the data on SDGE's site -> https://www.sdge.com/whenmatters#how-it-works; 
       months from June to October are denoted as 'summer' and months from November to May as 'winter'. """
    if month in [6,7,8,9,10]:
        return "summer"
    else:
        return "winter"

In [254]:
CT.reset_index(inplace=True)
CT.head()

Unnamed: 0,ts,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
0,2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
1,2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2,2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
3,2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
4,2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [255]:
#CT.reset_index(inplace=True)
#CT = CT.drop(['level_0', 'index'], axis=1)
CT['season'] = CT.ts.dt.month.apply(season_calc)
CT = CT.set_index(['ts'])
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter


In [256]:
# Dividing the hours into 4 groups-> night, morning, afternoon, evening

hour_dict = {'morning': list(np.arange(7,13)),'afternoon': list(np.arange(13,16)), 'evening': list(np.arange(16,22)),
            'night': [22, 23, 0, 1, 2, 3, 4, 5, 6]}
hour_dict

{'morning': [7, 8, 9, 10, 11, 12],
 'afternoon': [13, 14, 15],
 'evening': [16, 17, 18, 19, 20, 21],
 'night': [22, 23, 0, 1, 2, 3, 4, 5, 6]}

In [257]:
def time_of_day(x):
    if x in hour_dict['morning']:
        return 'morning'
    elif x in hour_dict['afternoon']:
        return 'afternoon'
    elif x in hour_dict['evening']:
        return 'evening'
    else:
        return 'night'


In [258]:
CT['time_of_day'] = CT['hour'].apply(time_of_day)
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter,night


In [259]:
# creating categorical columns for linear regression 
cat_cols1 = ['month', 'day_of_year', 'hour', 'day_of_week', 'season', 'holiday', 'non_working', 'time_of_day']
#not including year above to capture the decreasing energy trend over increasing value of years
for col in cat_cols1:
    CT[col] = CT[col].astype('category')

In [260]:
CT['year'] = CT['year'].astype('int64')
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter,night


In [261]:
cols_use = ['demand', 'year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
CT1_lin = pd.get_dummies(CT[cols_use], drop_first = False)
print(CT1_lin.shape)
CT1_lin.head()

(113976, 12)


Unnamed: 0_level_0,demand,year,drybulb,dewpnt,time_of_day_afternoon,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_summer,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2004-01-01 00:00:00,3126.0,2004,33.0,26.0,0,0,0,1,1,0,0,1
2004-01-01 01:00:00,2945.0,2004,34.0,26.0,0,0,0,1,1,0,0,1
2004-01-01 02:00:00,2804.0,2004,40.0,26.0,0,0,0,1,1,0,0,1
2004-01-01 03:00:00,2729.0,2004,38.0,23.0,0,0,0,1,1,0,0,1
2004-01-01 04:00:00,2722.0,2004,37.0,21.0,0,0,0,1,1,0,0,1


In [262]:
from sklearn.ensemble import RandomForestRegressor

In [263]:
cols_to_transform = ['drybulb', 'dewpnt', 'year']
X_train, X_test, y_train, y_test = TrainTestSplit(CT1_lin, test_size = 0.15, scale = True, cols_to_transform=cols_to_transform, 
                                              include_test_scale=False)

In [264]:
X_train.head()

Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_afternoon,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_summer,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-01-01 00:00:00,-1.580168,-0.949256,-0.660553,0,0,0,1,1,0,0,1
2004-01-01 01:00:00,-1.580168,-0.897289,-0.660553,0,0,0,1,1,0,0,1
2004-01-01 02:00:00,-1.580168,-0.585487,-0.660553,0,0,0,1,1,0,0,1
2004-01-01 03:00:00,-1.580168,-0.689421,-0.811779,0,0,0,1,1,0,0,1
2004-01-01 04:00:00,-1.580168,-0.741388,-0.912596,0,0,0,1,1,0,0,1


In [265]:
# Tuning Random forest
# n_estimators = number of trees in the forest
# max_features = max number of features considered for splitting a node

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(10, 200, 10, endpoint=True)]
max_features = ['auto', 'sqrt']
max_depth = list(range(1,6))
# Create the random grid
random_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth':max_depth}
print(random_grid)

{'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 2, 3, 4, 5]}


In [266]:
#import randomsearchcv
from sklearn.model_selection import RandomizedSearchCV

# First create the base model to tune
rf = RandomForestRegressor()

# Creating a time series split as discussed in the Introduction
tscv = TimeSeriesSplit(n_splits=5)
# Random search of parameters
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               cv = tscv, verbose=2, random_state = 42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

rf_random.best_params_
#rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 73, 'max_features': 'auto', 'max_depth': 5}

# 1. Creating our population

In [267]:
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter,night


In [268]:
pop = CT.drop(['zone', 'date', 'day_of_week', 'day_of_year', 'weekend', 'holiday', 'trend', 'time_of_day', 'season', 'demand'], axis=1)
#CTtest = CTtest.drop(['index'], axis=1)
pop.head()

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working


In [269]:
scaler = StandardScaler()
cols_to_transform = ['drybulb', 'dewpnt', 'year']
scalar = scaler.fit(pop[cols_to_transform])

## 1.1 Turning our pop into X_train format

In [270]:
pop_forDemandPred = pop.copy()
pop_forDemandPred['season'] = pop_forDemandPred['month'].apply(season_calc)
pop_forDemandPred['time_of_day'] = pop_forDemandPred['hour'].apply(time_of_day)
pop_forDemandPred.head()

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working,winter,night
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working,winter,night
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working,winter,night
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working,winter,night
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working,winter,night


In [271]:
cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
pop_forDemandPred = pd.get_dummies(pop_forDemandPred[cols_use], drop_first = False)
print(pop_forDemandPred.shape)
pop_forDemandPred.head()

(113976, 11)


Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_afternoon,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_summer,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-01-01 00:00:00,2004,33.0,26.0,0,0,0,1,1,0,0,1
2004-01-01 01:00:00,2004,34.0,26.0,0,0,0,1,1,0,0,1
2004-01-01 02:00:00,2004,40.0,26.0,0,0,0,1,1,0,0,1
2004-01-01 03:00:00,2004,38.0,23.0,0,0,0,1,1,0,0,1
2004-01-01 04:00:00,2004,37.0,21.0,0,0,0,1,1,0,0,1


In [272]:
pop_forDemandPred[cols_to_transform] = scaler.transform(pop_forDemandPred[cols_to_transform])
pop_forDemandPred.head()

Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_afternoon,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_summer,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2004-01-01 00:00:00,-1.603278,-0.95769,-0.663635,0,0,0,1,1,0,0,1
2004-01-01 01:00:00,-1.603278,-0.905916,-0.663635,0,0,0,1,1,0,0,1
2004-01-01 02:00:00,-1.603278,-0.595267,-0.663635,0,0,0,1,1,0,0,1
2004-01-01 03:00:00,-1.603278,-0.698816,-0.814972,0,0,0,1,1,0,0,1
2004-01-01 04:00:00,-1.603278,-0.750591,-0.915863,0,0,0,1,1,0,0,1


In [273]:
rf_random.predict(pop_forDemandPred)

array([2967.78883189, 2956.37549973, 2780.23220803, ..., 3424.51311942,
       2950.38046196, 2950.38046196])

In [274]:
def predict_demand(df):
    output = df.copy()
    pop_forDemandPred = df.copy()

    pop_forDemandPred['season'] = pop_forDemandPred['month'].apply(season_calc)
    pop_forDemandPred['time_of_day'] = pop_forDemandPred['hour'].apply(time_of_day)

    cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
    cols_to_transform = ['drybulb', 'dewpnt', 'year']
    pop_forDemandPred = pd.get_dummies(pop_forDemandPred[cols_use], drop_first = False)
    pop_forDemandPred[cols_to_transform] = scaler.transform(pop_forDemandPred[cols_to_transform])

    predicted_values = rf_random.predict(pop_forDemandPred)
    output['demand'] = predicted_values

    return output

In [275]:
predict_demand(pop)

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working,demand
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working,2967.788832
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working,2956.375500
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working,2780.232208
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working,2956.375500
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working,2956.375500
...,...,...,...,...,...,...,...
2016-12-31 19:00:00,40.0,29.0,2016,12,19,non-working,3424.513119
2016-12-31 20:00:00,41.0,30.0,2016,12,20,non-working,3424.513119
2016-12-31 21:00:00,38.0,32.0,2016,12,21,non-working,3424.513119
2016-12-31 22:00:00,37.0,32.0,2016,12,22,non-working,2950.380462


# Selection

In [276]:
pop_small = pop.sample(1000)
print(pop_small.shape)
pop_small.reset_index(inplace=True, drop=True)
pop_small.head()

(1000, 6)


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,25.0,21.0,2009,1,0,non-working
1,64.0,32.0,2008,5,23,working
2,52.0,49.0,2015,4,19,working
3,59.0,44.0,2004,5,23,working
4,42.0,24.0,2007,3,1,non-working


In [277]:
popwithDemand = predict_demand(pop_small)
popwithDemand

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand
0,25.0,21.0,2009,1,0,non-working,3284.093127
1,64.0,32.0,2008,5,23,working,3099.870561
2,52.0,49.0,2015,4,19,working,3888.302483
3,59.0,44.0,2004,5,23,working,2693.642666
4,42.0,24.0,2007,3,1,non-working,2693.642666
...,...,...,...,...,...,...,...
995,74.0,62.0,2014,8,20,working,4389.939566
996,33.0,15.0,2004,2,15,working,4270.316824
997,33.0,16.0,2009,1,16,working,4152.052285
998,42.0,29.0,2007,1,8,non-working,3424.513119


In [278]:
def fitness(df):
    mu = df['demand'].mean()
    sd = df['demand'].std()

    Fitness = []
    for i in range(len(df)):
        z = ((df['demand'][i]) - mu) / sd
        Fitness.append(z)

    invFitness = []
    for i in range(len(Fitness)):
        x = Fitness[i] * (-1)
        invFitness.append(x)

    NormalFitness = []
    min = np.min(invFitness)
    max = np.max(invFitness)


    for i in range(len(invFitness)):
        n = (invFitness[i] - min) / (max - min)
        NormalFitness.append(n)

    df_Fitness = df.copy()
    df_Fitness['Fitness'] = NormalFitness

    return df_Fitness


In [279]:
eval = fitness(popwithDemand)
eval

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness
0,25.0,21.0,2009,1,0,non-working,3284.093127,0.823756
1,64.0,32.0,2008,5,23,working,3099.870561,0.878192
2,52.0,49.0,2015,4,19,working,3888.302483,0.645217
3,59.0,44.0,2004,5,23,working,2693.642666,0.998229
4,42.0,24.0,2007,3,1,non-working,2693.642666,0.998229
...,...,...,...,...,...,...,...,...
995,74.0,62.0,2014,8,20,working,4389.939566,0.496988
996,33.0,15.0,2004,2,15,working,4270.316824,0.532335
997,33.0,16.0,2009,1,16,working,4152.052285,0.567281
998,42.0,29.0,2007,1,8,non-working,3424.513119,0.782263


In [280]:
def roulette_selection(df):
    
    F = df['Fitness'].sum()
    df['SelectionProb'] = df['Fitness'] / F
    df['CumulativeProb'] = df['SelectionProb'].cumsum()

    selectors = np.random.random_sample((len(df),))

    dict_copy = df.to_dict('records')
    selectedChromsIndexes = []

    i = 0
    for selector in selectors:
        for r in dict_copy:
            if (r['CumulativeProb']) > selector:
                selectedChromsIndexes.append(r)
                break

    selected = pd.DataFrame(selectedChromsIndexes)

    return selected



In [281]:
selected = roulette_selection(eval)
selected

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness,SelectionProb,CumulativeProb
0,70.0,61.0,2015,5,2,non-working,3266.455477,0.828967,0.001168,0.028913
1,84.0,54.0,2008,9,16,non-working,4113.337775,0.578721,0.000815,0.160494
2,82.0,65.0,2011,5,13,non-working,4458.305761,0.476786,0.000672,0.837480
3,77.0,52.0,2006,8,11,working,4445.336565,0.480618,0.000677,0.358248
4,41.0,40.0,2012,12,1,non-working,2687.647628,1.000000,0.001409,0.007530
...,...,...,...,...,...,...,...,...,...,...
995,68.0,27.0,2009,4,14,working,3923.563918,0.634798,0.000894,0.727765
996,83.0,72.0,2013,7,9,working,5309.642665,0.225224,0.000317,0.312118
997,38.0,16.0,2008,2,15,non-working,3424.513119,0.782263,0.001102,0.979826
998,35.0,21.0,2008,1,19,non-working,4026.948474,0.604248,0.000851,0.107449


# Cross

In [282]:
select = selected.copy()
select['ProbCros'] = np.random.random(len(select))
select['Cross'] = select.apply(lambda x: 1 if (x['ProbCros'] < 0.25) else 0, axis=1)

In [283]:
select.head()

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness,SelectionProb,CumulativeProb,ProbCros,Cross
0,70.0,61.0,2015,5,2,non-working,3266.455477,0.828967,0.001168,0.028913,0.006406,1
1,84.0,54.0,2008,9,16,non-working,4113.337775,0.578721,0.000815,0.160494,0.301646,0
2,82.0,65.0,2011,5,13,non-working,4458.305761,0.476786,0.000672,0.83748,0.542543,0
3,77.0,52.0,2006,8,11,working,4445.336565,0.480618,0.000677,0.358248,0.226091,1
4,41.0,40.0,2012,12,1,non-working,2687.647628,1.0,0.001409,0.00753,0.204416,1


In [284]:
import random

In [285]:
def sp_crossover(chrom1, chrom2):
    cross_point = random.randint(1,5)
    c1_l = chrom1[0:cross_point]
    c1_r = chrom1[cross_point:len(chrom1)]

    c2_l = chrom2[0:cross_point]
    c2_r = chrom2[cross_point:len(chrom2)]

    child1 = np.concatenate((c1_l, c2_r))
    child2 = np.concatenate((c2_l, c1_r))

    return child1, child2

In [286]:
def cross(df, crossProb = 0.25):
    select = df.copy()
    select['ProbCros'] = np.random.random(len(select))
    select['Cross'] = select.apply(lambda x: 1 if (x['ProbCros'] < crossProb) else 0, axis=1)
    
    cross = select.loc[select['Cross'] == 1]
    cross = cross.iloc[:, 0:6]
    cross['group'] = 1
    cross['group'].iloc[(int(len(cross) / 2)):] = 2

    if len(cross[cross['group'] == 1]) != len(cross[cross['group'] == 2]):
        cross = cross.iloc[:-1 , :]
    cross.drop(['group'], inplace=True, axis=1)

    group1 = cross.iloc[:(int(len(cross) / 2))].to_numpy()
    group2 = cross.iloc[(int(len(cross) / 2)):].to_numpy()

    crossedGen = []
    for i, j in zip(group1, group2):
        children = sp_crossover(i, j)
        crossedGen.append(children[0])
        crossedGen.append(children[1])

    cross['children'] = crossedGen
    cross[:] = cross.pop('children').to_list()

    for i, j in cross.iterrows():
        select.iloc[i, 0:6] = j

    return (select.iloc[:, :6])

In [287]:
crossed = cross(selected)
print('Are the selected and crossed the same?:', selected.equals(crossed))
crossed

Are the selected and crossed the same?: False


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,70.0,61.0,2011,7,4,working
1,64.0,57.0,2015,5,2,non-working
2,84.0,54.0,2008,9,16,working
3,77.0,52.0,2006,8,11,working
4,41.0,40.0,2012,12,1,non-working
...,...,...,...,...,...,...
995,68.0,27.0,2009,4,14,working
996,83.0,72.0,2013,7,9,working
997,38.0,16.0,2008,2,15,non-working
998,35.0,34.0,2009,4,12,non-working


# Mutation

In [288]:
# non-working Mutation
nonWorkingMutated = []
for i in crossed['non_working'].values:
    print(i)
    if (np.random.random()) < 0.01:
        if i == 'working':
            x = 'non-working'
            nonWorkingMutated.append(x)
        else:
            x = 'working'
            nonWorkingMutated.append(x)
    else:
        k = i
        nonWorkingMutated.append(k)
    break

nonWorkingMutated


working


['working']

In [289]:
def mutate(df, mutation_rate=0.01):
    newTEST = df.copy()

    drybulbMutated = []
    for i in newTEST['drybulb'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['drybulb'].min()), (df['drybulb'].max() +1))
            drybulbMutated.append(x)
        else:
            k = i
            drybulbMutated.append(k)

    newTEST['drybulb'] = drybulbMutated

    #dewpnt Mutation
    dewpntMutated = []
    for i in newTEST['dewpnt'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['dewpnt'].min()), (df['dewpnt'].max() +1))
            dewpntMutated.append(x)
        else:
            k = i
            dewpntMutated.append(k)

    newTEST['dewpnt'] = dewpntMutated

    # Year Mutation
    yearMutated = []
    for i in newTEST['year'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['year'].min()), (df['year'].max() +1))
            yearMutated.append(x)
        else:
            k = i
            yearMutated.append(k)

    newTEST['year'] = yearMutated

    # Month Mutation
    monthMutated = []
    for i in newTEST['month'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['month'].min()), (df['month'].max() +1))
            monthMutated.append(x)
        else:
            k = i
            monthMutated.append(k)

    newTEST['month'] = monthMutated

    # Hour Mutation
    hourMutated = []
    for i in newTEST['hour'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['hour'].min()), (df['hour'].max() +1))
            hourMutated.append(x)
        else:
            k = i
            hourMutated.append(k)

    newTEST['hour'] = hourMutated

    # non-working Mutation
    nonWorkingMutated = []
    for i in crossed['non_working'].values:
        if (np.random.random()) < mutation_rate:
            if i == 'working':
                x = 'non-working'
                nonWorkingMutated.append(x)
            else:
                x = 'working'
                nonWorkingMutated.append(x)
        else:
            k = i
            nonWorkingMutated.append(k)

    newTEST['non_working'] = nonWorkingMutated

    return newTEST


In [290]:
mutated = mutate(crossed)
print('Are the crossed and mutated the same?:', crossed.equals(mutated))
mutated

Are the crossed and mutated the same?: False


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,70.0,61.0,2011,7,4,non-working
1,64.0,57.0,2015,5,2,non-working
2,84.0,54.0,2008,9,16,working
3,77.0,52.0,2006,8,11,working
4,41.0,40.0,2012,12,1,non-working
...,...,...,...,...,...,...
995,68.0,27.0,2009,4,14,working
996,83.0,72.0,2013,7,9,working
997,38.0,16.0,2008,2,15,non-working
998,35.0,34.0,2009,4,12,non-working


# Putting all together

In [291]:
def GA(df, NumOfGenerations= 50):
    pop = df.copy()
    for i in range(NumOfGenerations):
        popwithDemand = predict_demand(pop)
        eval = fitness(popwithDemand)
        selected = roulette_selection(eval)
        crossed = cross(selected)
        mutated = mutate(crossed)
        pop = mutated.copy()

    return pop

In [292]:
FinalGen = GA(pop_small, NumOfGenerations=50)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

ValueError: X has 10 features, but DecisionTreeRegressor is expecting 11 features as input.

In [92]:
def TrainTestSplit(data, test_size = 0.15, scale = False, cols_to_transform=None, include_test_scale=False):
    
    df = data.copy()
    # get the index after which test set starts
    test_index = int(len(df)*(1-test_size))
    
    # StandardScaler fit on the entire dataset
    if scale and include_test_scale:
        scaler = StandardScaler()
        df[cols_to_transform] = scaler.fit_transform(df[cols_to_transform])
        
    X_train = df.drop('demand', axis = 1).iloc[:test_index]
    y_train = df.demand.iloc[:test_index]
    X_test = df.drop('demand', axis = 1).iloc[test_index:]
    y_test = df.demand.iloc[test_index:]
    
    # StandardScaler fit only on the training set
    if scale and not include_test_scale:
        scaler = StandardScaler()
        X_train[cols_to_transform] = scaler.fit_transform(X_train[cols_to_transform])
        X_test[cols_to_transform] = scaler.transform(X_test[cols_to_transform])
    
    return X_train, X_test, y_train, y_test

In [93]:
# Importing the dataset
df = pd.read_csv('./Data/cleandata/CleanedCT.csv', parse_dates=['ts'], index_col='ts')
df = df.drop(['Unnamed: 0'], axis=1)
# df['non_working'] = df.apply(lambda x: True if \
#                                          ((x['non_working'] == 'non-working'))
#                                          else False, axis = 1)
print('Data Frame Shape:', df.shape)
df.head()

Data Frame Shape: (113976, 14)


Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [94]:
df['day_of_week'] = df['day_of_week'].astype('category')
df['non_working'] = df['non_working'].astype('category')
df['month'] = df['month'].astype('category')
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [95]:

def season(month):
    if month in [6,7,8,9,10]:
        return "summer"
    else:
        return "winter"

In [96]:
df.reset_index(inplace=True)
df.head()

Unnamed: 0,ts,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
0,2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
1,2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2,2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
3,2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
4,2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [97]:
df['season'] = df.ts.dt.month.apply(season)
df = df.set_index(['ts'])
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter


In [98]:
# Dividing the hours into 4 groups-> night, morning, afternoon, evening

hour_dict = {'morning': list(np.arange(7,13)),'afternoon': list(np.arange(13,16)), 'evening': list(np.arange(16,22)),
            'night': [22, 23, 0, 1, 2, 3, 4, 5, 6]}
hour_dict

{'morning': [7, 8, 9, 10, 11, 12],
 'afternoon': [13, 14, 15],
 'evening': [16, 17, 18, 19, 20, 21],
 'night': [22, 23, 0, 1, 2, 3, 4, 5, 6]}

In [99]:
def time_of_day(x):
    if x in hour_dict['morning']:
        return 'morning'
    elif x in hour_dict['afternoon']:
        return 'afternoon'
    elif x in hour_dict['evening']:
        return 'evening'
    else:
        return 'night'


In [100]:
df['time_of_day'] = df['hour'].apply(time_of_day)
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter,night


In [101]:
# creating categorical columns for linear regression 
cat_cols1 = ['month', 'day_of_year', 'hour', 'day_of_week', 'season', 'holiday', 'non_working', 'time_of_day']
#not including year above to capture the decreasing energy trend over increasing value of years
for col in cat_cols1:
    df[col] = df[col].astype('category')

In [102]:
df['year'] = df['year'].astype('int64')
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter,night


In [103]:
cols_use = ['demand', 'year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
CT1_lin = pd.get_dummies(df[cols_use], drop_first = True)
print(CT1_lin.shape)
CT1_lin.head()

(113976, 9)


Unnamed: 0_level_0,demand,year,drybulb,dewpnt,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_working,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-01-01 00:00:00,3126.0,2004,33.0,26.0,0,0,1,0,1
2004-01-01 01:00:00,2945.0,2004,34.0,26.0,0,0,1,0,1
2004-01-01 02:00:00,2804.0,2004,40.0,26.0,0,0,1,0,1
2004-01-01 03:00:00,2729.0,2004,38.0,23.0,0,0,1,0,1
2004-01-01 04:00:00,2722.0,2004,37.0,21.0,0,0,1,0,1


In [104]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [105]:
cols_to_transform = ['drybulb', 'dewpnt', 'year']
X_train, X_test, y_train, y_test = TrainTestSplit(CT1_lin, test_size = 0.15, scale = True, cols_to_transform=cols_to_transform, 
                                              include_test_scale=False)

In [106]:
X_train.head()

Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_working,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,-1.580168,-0.949256,-0.660553,0,0,1,0,1
2004-01-01 01:00:00,-1.580168,-0.897289,-0.660553,0,0,1,0,1
2004-01-01 02:00:00,-1.580168,-0.585487,-0.660553,0,0,1,0,1
2004-01-01 03:00:00,-1.580168,-0.689421,-0.811779,0,0,1,0,1
2004-01-01 04:00:00,-1.580168,-0.741388,-0.912596,0,0,1,0,1


In [107]:
# Tuning Random forest
# n_estimators = number of trees in the forest
# max_features = max number of features considered for splitting a node

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(10, 200, 10, endpoint=True)]
max_features = ['auto', 'sqrt']
max_depth = list(range(1,6))
# Create the random grid
random_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth':max_depth}
print(random_grid)

{'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 2, 3, 4, 5]}


In [108]:
#import randomsearchcv and time series split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit

# First create the base model to tune
rf = RandomForestRegressor()

# Creating a time series split as discussed in the Introduction
tscv = TimeSeriesSplit(n_splits=5)
# Random search of parameters
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               cv = tscv, verbose=2, random_state = 42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

rf_random.best_params_
#rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 73, 'max_features': 'auto', 'max_depth': 5}

# 1. Creating our population

In [109]:
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter,night


In [110]:
pop = df.drop(['zone', 'date', 'day_of_week', 'day_of_year', 'weekend', 'holiday', 'trend', 'time_of_day', 'season', 'demand'], axis=1)
#CTtest = CTtest.drop(['index'], axis=1)
pop.head()

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working


In [111]:
scaler = StandardScaler()
cols_to_transform = ['drybulb', 'dewpnt', 'year']
scalar = scaler.fit(pop[cols_to_transform])

## 1.1 Turning our pop into X_train format

In [112]:
pop_forDemandPred = pop.copy()
pop_forDemandPred['season'] = pop_forDemandPred['month'].apply(season)
pop_forDemandPred['time_of_day'] = pop_forDemandPred['hour'].apply(time_of_day)
pop_forDemandPred.head()

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working,winter,night
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working,winter,night
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working,winter,night
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working,winter,night
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working,winter,night


In [113]:
cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
pop_forDemandPred = pd.get_dummies(pop_forDemandPred[cols_use], drop_first = True)
print(pop_forDemandPred.shape)
pop_forDemandPred.head()

(113976, 8)


Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_working,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,2004,33.0,26.0,0,0,1,0,1
2004-01-01 01:00:00,2004,34.0,26.0,0,0,1,0,1
2004-01-01 02:00:00,2004,40.0,26.0,0,0,1,0,1
2004-01-01 03:00:00,2004,38.0,23.0,0,0,1,0,1
2004-01-01 04:00:00,2004,37.0,21.0,0,0,1,0,1


In [114]:
pop_forDemandPred[cols_to_transform] = scaler.transform(pop_forDemandPred[cols_to_transform])
print(pop_forDemandPred.shape)
pop_forDemandPred.head()

(113976, 8)


Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_working,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,-1.603278,-0.95769,-0.663635,0,0,1,0,1
2004-01-01 01:00:00,-1.603278,-0.905916,-0.663635,0,0,1,0,1
2004-01-01 02:00:00,-1.603278,-0.595267,-0.663635,0,0,1,0,1
2004-01-01 03:00:00,-1.603278,-0.698816,-0.814972,0,0,1,0,1
2004-01-01 04:00:00,-1.603278,-0.750591,-0.915863,0,0,1,0,1


In [115]:
rf_random.predict(pop_forDemandPred)

array([2954.00559239, 2954.00559239, 2822.4524446 , ..., 3423.63915934,
       2945.24228601, 2945.24228601])

## 1.2 Creating Predict Demand function

In [116]:
def predict_demand(df):
    output = df.copy()
    pop_forDemandPred = df.copy()

    pop_forDemandPred['season'] = pop_forDemandPred['month'].apply(season)
    pop_forDemandPred['time_of_day'] = pop_forDemandPred['hour'].apply(time_of_day)

    cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
    cols_to_transform = ['drybulb', 'dewpnt', 'year']
    pop_forDemandPred = pd.get_dummies(pop_forDemandPred[cols_use], drop_first = True)
    pop_forDemandPred[cols_to_transform] = scaler.transform(pop_forDemandPred[cols_to_transform])

    predicted_values = rf_random.predict(pop_forDemandPred)
    output['demand'] = predicted_values

    return output

In [117]:
predict_demand(pop)

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working,demand
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working,2954.005592
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working,2954.005592
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working,2822.452445
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working,2954.005592
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working,2954.005592
...,...,...,...,...,...,...,...
2016-12-31 19:00:00,40.0,29.0,2016,12,19,non-working,3423.639159
2016-12-31 20:00:00,41.0,30.0,2016,12,20,non-working,3423.639159
2016-12-31 21:00:00,38.0,32.0,2016,12,21,non-working,3423.639159
2016-12-31 22:00:00,37.0,32.0,2016,12,22,non-working,2945.242286


# Selection

In [118]:
pop_small = pop.sample(1000)
print(pop_small.shape)
pop_small.reset_index(inplace=True, drop=True)
pop_small.head()

(1000, 6)


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,70.0,65.0,2011,5,1,non-working
1,45.0,11.0,2006,3,16,working
2,42.0,32.0,2012,1,20,working
3,75.0,64.0,2014,9,13,working
4,64.0,58.0,2013,9,8,working


In [119]:
popwithDemand = predict_demand(pop_small)
popwithDemand

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand
0,70.0,65.0,2011,5,1,non-working,3270.199799
1,45.0,11.0,2006,3,16,working,3914.744067
2,42.0,32.0,2012,1,20,working,3889.117898
3,75.0,64.0,2014,9,13,working,4403.392502
4,64.0,58.0,2013,9,8,working,3889.117898
...,...,...,...,...,...,...,...
995,47.0,35.0,2013,10,1,working,2686.745057
996,78.0,47.0,2014,7,16,working,4566.458315
997,58.0,51.0,2010,7,7,working,3889.117898
998,29.0,24.0,2008,2,7,working,4542.130701


In [120]:
def fitness(df):
    mu = df['demand'].mean()
    sd = df['demand'].std()

    Fitness = []
    for i in range(len(df)):
        z = ((df['demand'][i]) - mu) / sd
        Fitness.append(z)

    invFitness = []
    for i in range(len(Fitness)):
        x = Fitness[i] * (-1)
        invFitness.append(x)

    NormalFitness = []
    min = np.min(invFitness)
    max = np.max(invFitness)


    for i in range(len(invFitness)):
        n = (invFitness[i] - min) / (max - min)
        NormalFitness.append(n)

    df_Fitness = df.copy()
    df_Fitness['Fitness'] = NormalFitness

    return df_Fitness


In [121]:
eval = fitness(popwithDemand)
eval

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness
0,70.0,65.0,2011,5,1,non-working,3270.199799,0.827741
1,45.0,11.0,2006,3,16,working,3914.744067,0.637447
2,42.0,32.0,2012,1,20,working,3889.117898,0.645013
3,75.0,64.0,2014,9,13,working,4403.392502,0.493179
4,64.0,58.0,2013,9,8,working,3889.117898,0.645013
...,...,...,...,...,...,...,...,...
995,47.0,35.0,2013,10,1,working,2686.745057,1.000000
996,78.0,47.0,2014,7,16,working,4566.458315,0.445035
997,58.0,51.0,2010,7,7,working,3889.117898,0.645013
998,29.0,24.0,2008,2,7,working,4542.130701,0.452218


In [122]:
def roulette_selection(df):
    
    F = df['Fitness'].sum()
    df['SelectionProb'] = df['Fitness'] / F
    df['CumulativeProb'] = df['SelectionProb'].cumsum()

    selectors = np.random.random_sample((len(df),))

    dict_copy = df.to_dict('records')
    selectedChromsIndexes = []

    i = 0
    for selector in selectors:
        for r in dict_copy:
            if (r['CumulativeProb']) > selector:
                selectedChromsIndexes.append(r)
                break

    selected = pd.DataFrame(selectedChromsIndexes)

    return selected

In [123]:
selected = roulette_selection(eval)
selected

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness,SelectionProb,CumulativeProb
0,34.0,12.0,2005,2,12,working,4127.606037,0.574602,0.000828,0.162499
1,53.0,52.0,2004,8,2,working,2695.508363,0.997413,0.001437,0.377177
2,52.0,21.0,2004,2,14,non-working,3423.639159,0.782440,0.001127,0.659026
3,51.0,49.0,2006,3,9,working,3914.744067,0.637447,0.000918,0.715922
4,75.0,62.0,2005,7,22,non-working,4100.726994,0.582537,0.000839,0.480601
...,...,...,...,...,...,...,...,...,...,...
995,84.0,57.0,2011,7,17,non-working,4124.207057,0.575605,0.000829,0.839844
996,36.0,20.0,2010,3,8,working,3889.117898,0.645013,0.000929,0.222851
997,24.0,16.0,2011,2,7,non-working,3805.506540,0.669698,0.000965,0.015730
998,34.0,7.0,2008,1,15,working,4127.606037,0.574602,0.000828,0.244788


# Cross

In [124]:
select = selected.copy()
select['ProbCros'] = np.random.random(len(select))
select['Cross'] = select.apply(lambda x: 1 if (x['ProbCros'] < 0.25) else 0, axis=1)

In [125]:
select.head()

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness,SelectionProb,CumulativeProb,ProbCros,Cross
0,34.0,12.0,2005,2,12,working,4127.606037,0.574602,0.000828,0.162499,0.041562,1
1,53.0,52.0,2004,8,2,working,2695.508363,0.997413,0.001437,0.377177,0.306934,0
2,52.0,21.0,2004,2,14,non-working,3423.639159,0.78244,0.001127,0.659026,0.855736,0
3,51.0,49.0,2006,3,9,working,3914.744067,0.637447,0.000918,0.715922,0.104726,1
4,75.0,62.0,2005,7,22,non-working,4100.726994,0.582537,0.000839,0.480601,0.603557,0


In [126]:
import random

In [127]:
def sp_crossover(chrom1, chrom2):
    cross_point = random.randint(1,5)
    c1_l = chrom1[0:cross_point]
    c1_r = chrom1[cross_point:len(chrom1)]

    c2_l = chrom2[0:cross_point]
    c2_r = chrom2[cross_point:len(chrom2)]

    child1 = np.concatenate((c1_l, c2_r))
    child2 = np.concatenate((c2_l, c1_r))

    return child1, child2

In [128]:
def cross(df, crossProb = 0.25):
    select = df.copy()
    select['ProbCros'] = np.random.random(len(select))
    select['Cross'] = select.apply(lambda x: 1 if (x['ProbCros'] < crossProb) else 0, axis=1)
    
    cross = select.loc[select['Cross'] == 1]
    cross = cross.iloc[:, 0:6]
    cross['group'] = 1
    cross['group'].iloc[(int(len(cross) / 2)):] = 2

    if len(cross[cross['group'] == 1]) != len(cross[cross['group'] == 2]):
        cross = cross.iloc[:-1 , :]
    cross.drop(['group'], inplace=True, axis=1)

    group1 = cross.iloc[:(int(len(cross) / 2))].to_numpy()
    group2 = cross.iloc[(int(len(cross) / 2)):].to_numpy()

    crossedGen = []
    for i, j in zip(group1, group2):
        children = sp_crossover(i, j)
        crossedGen.append(children[0])
        crossedGen.append(children[1])

    cross['children'] = crossedGen
    cross[:] = cross.pop('children').to_list()

    for i, j in cross.iterrows():
        select.iloc[i, 0:6] = j

    return (select.iloc[:, :6])

In [129]:
crossed = cross(selected)
print('Are the selected and crossed the same?:', selected.equals(crossed))
crossed

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Are the selected and crossed the same?: False


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,34.0,12.0,2008,9,19,working
1,53.0,52.0,2004,8,2,working
2,65.0,53.0,2005,2,12,working
3,51.0,49.0,2006,3,9,working
4,75.0,62.0,2005,7,22,non-working
...,...,...,...,...,...,...
995,84.0,57.0,2011,7,17,non-working
996,36.0,20.0,2010,3,8,working
997,24.0,16.0,2011,2,7,non-working
998,34.0,7.0,2008,1,15,working


# Mutation

In [130]:
def mutate(df, mutation_rate=0.01):
    newTEST = df.copy()

    drybulbMutated = []
    for i in newTEST['drybulb'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['drybulb'].min()), (df['drybulb'].max() +1))
            drybulbMutated.append(x)
        else:
            k = i
            drybulbMutated.append(k)

    newTEST['drybulb'] = drybulbMutated

    #dewpnt Mutation
    dewpntMutated = []
    for i in newTEST['dewpnt'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['dewpnt'].min()), (df['dewpnt'].max() +1))
            dewpntMutated.append(x)
        else:
            k = i
            dewpntMutated.append(k)

    newTEST['dewpnt'] = dewpntMutated

    # Year Mutation
    yearMutated = []
    for i in newTEST['year'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['year'].min()), (df['year'].max() +1))
            yearMutated.append(x)
        else:
            k = i
            yearMutated.append(k)

    newTEST['year'] = yearMutated

    # Month Mutation
    monthMutated = []
    for i in newTEST['month'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['month'].min()), (df['month'].max() +1))
            monthMutated.append(x)
        else:
            k = i
            monthMutated.append(k)

    newTEST['month'] = monthMutated

    # Hour Mutation
    hourMutated = []
    for i in newTEST['hour'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['hour'].min()), (df['hour'].max() +1))
            hourMutated.append(x)
        else:
            k = i
            hourMutated.append(k)

    newTEST['hour'] = hourMutated

    # non-working Mutation
    nonWorkingMutated = []
    for i in crossed['non_working'].values:
        if (np.random.random()) < mutation_rate:
            if i == 'working':
                x = 'non-working'
                nonWorkingMutated.append(x)
            else:
                x = 'working'
                nonWorkingMutated.append(x)
        else:
            k = i
            nonWorkingMutated.append(k)

    newTEST['non_working'] = nonWorkingMutated

    return newTEST


In [131]:
mutated = mutate(crossed)
print('Are the crossed and mutated the same?:', crossed.equals(mutated))
mutated

Are the crossed and mutated the same?: False


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,34.0,12.0,2008,9,19,working
1,53.0,52.0,2004,8,2,working
2,65.0,53.0,2004,2,12,working
3,51.0,49.0,2006,3,9,working
4,75.0,62.0,2005,7,22,non-working
...,...,...,...,...,...,...
995,84.0,57.0,2011,7,17,non-working
996,36.0,20.0,2010,3,8,working
997,24.0,16.0,2011,2,7,non-working
998,34.0,7.0,2008,1,15,working


# Putting all together

In [132]:
def GA(df, NumOfGenerations= 50):
    pop = df.copy()
    for i in range(NumOfGenerations):
        popwithDemand = predict_demand(pop)
        eval = fitness(popwithDemand)
        selected = roulette_selection(eval)
        crossed = cross(selected)
        mutated = mutate(crossed)
        pop = mutated.copy()

    return pop

In [139]:
FinalGen = GA(pop_small, NumOfGenerations=50)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

ValueError: X has 7 features, but DecisionTreeRegressor is expecting 8 features as input.

In [138]:
FinalGen

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,45.0,31.0,2012,1,22,working
1,55.0,55.0,2016,4,0,working
2,52.0,45.0,2005,3,2,working
3,32.0,30.0,2015,8,1,working
4,52.0,51.0,2011,5,1,non-working
...,...,...,...,...,...,...
995,56.0,65.0,2007,1,3,non-working
996,55.0,38.0,2014,5,5,working
997,55.0,55.0,2009,5,23,non-working
998,62.0,38.0,2010,3,22,non-working
