In [416]:
import numpy as np
import pandas as pd


In [344]:
def TrainTestSplit(data, test_size = 0.15, scale = False, cols_to_transform=None, include_test_scale=False):
    
    df = data.copy()
    # get the index after which test set starts
    test_index = int(len(df)*(1-test_size))
    
    # StandardScaler fit on the entire dataset
    if scale and include_test_scale:
        scaler = StandardScaler()
        df[cols_to_transform] = scaler.fit_transform(df[cols_to_transform])
        
    X_train = df.drop('demand', axis = 1).iloc[:test_index]
    y_train = df.demand.iloc[:test_index]
    X_test = df.drop('demand', axis = 1).iloc[test_index:]
    y_test = df.demand.iloc[test_index:]
    
    # StandardScaler fit only on the training set
    if scale and not include_test_scale:
        scaler = StandardScaler()
        X_train[cols_to_transform] = scaler.fit_transform(X_train[cols_to_transform])
        X_test[cols_to_transform] = scaler.transform(X_test[cols_to_transform])
    
    return X_train, X_test, y_train, y_test

In [345]:
# Reading the gefcom dataset from rda file
df = pd.read_csv('./Data/cleandata/CleanedCT.csv')
df = df.drop(['Unnamed: 0'], axis=1)
df

Unnamed: 0,ts,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
0,2004-01-01 00:00:00,CT,3126.000,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
1,2004-01-01 01:00:00,CT,2945.000,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2,2004-01-01 02:00:00,CT,2804.000,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
3,2004-01-01 03:00:00,CT,2729.000,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
4,2004-01-01 04:00:00,CT,2722.000,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
113971,2016-12-31 19:00:00,CT,3744.918,40.0,29.0,2016-12-31,2016,12,19,Sat,366,True,False,121315.0,non-working
113972,2016-12-31 20:00:00,CT,3558.586,41.0,30.0,2016-12-31,2016,12,20,Sat,366,True,False,121316.0,non-working
113973,2016-12-31 21:00:00,CT,3378.466,38.0,32.0,2016-12-31,2016,12,21,Sat,366,True,False,121317.0,non-working
113974,2016-12-31 22:00:00,CT,3195.386,37.0,32.0,2016-12-31,2016,12,22,Sat,366,True,False,121318.0,non-working


In [346]:
df['day_of_week'] = df['day_of_week'].astype('category')
df['non_working'] = df['non_working'].astype('category')
df['month'] = df['month'].astype('category')
df['ts'] = pd.to_datetime(df['ts'])
CT =df.set_index('ts')
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [347]:
def season_calc(month):
    if month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Fall'
    else:
        return 'Winter'

In [348]:
CT.reset_index(inplace=True)
CT.head()

Unnamed: 0,ts,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
0,2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
1,2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2,2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
3,2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
4,2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [349]:
#CT.reset_index(inplace=True)
#CT = CT.drop(['level_0', 'index'], axis=1)
CT['season'] = CT.ts.dt.month.apply(season_calc)
CT = CT.set_index(['ts'])
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,Winter
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,Winter
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,Winter
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,Winter
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,Winter


In [350]:
# Dividing the hours into 4 groups-> night, morning, afternoon, evening

hour_dict = {'morning': list(np.arange(7,13)),'afternoon': list(np.arange(13,16)), 'evening': list(np.arange(16,22)),
            'night': [22, 23, 0, 1, 2, 3, 4, 5, 6]}
hour_dict

{'morning': [7, 8, 9, 10, 11, 12],
 'afternoon': [13, 14, 15],
 'evening': [16, 17, 18, 19, 20, 21],
 'night': [22, 23, 0, 1, 2, 3, 4, 5, 6]}

In [351]:
def time_of_day(x):
    if x in hour_dict['morning']:
        return 'morning'
    elif x in hour_dict['afternoon']:
        return 'afternoon'
    elif x in hour_dict['evening']:
        return 'evening'
    else:
        return 'night'


In [352]:
CT['time_of_day'] = CT['hour'].apply(time_of_day)
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,Winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,Winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,Winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,Winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,Winter,night


In [353]:
# creating categorical columns for linear regression 
cat_cols1 = ['month', 'day_of_year', 'hour', 'day_of_week', 'season', 'holiday', 'non_working', 'time_of_day']
#not including year above to capture the decreasing energy trend over increasing value of years
for col in cat_cols1:
    CT[col] = CT[col].astype('category')

In [354]:
CT['year'] = CT['year'].astype('int64')
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,Winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,Winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,Winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,Winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,Winter,night


In [355]:
cols_use = ['demand', 'year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
CT1_lin = pd.get_dummies(CT[cols_use], drop_first = False)
print(CT1_lin.shape)
CT1_lin.head()

(113976, 14)


Unnamed: 0_level_0,demand,year,drybulb,dewpnt,time_of_day_afternoon,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_Fall,season_Spring,season_Summer,season_Winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-01-01 00:00:00,3126.0,2004,33.0,26.0,0,0,0,1,1,0,0,0,0,1
2004-01-01 01:00:00,2945.0,2004,34.0,26.0,0,0,0,1,1,0,0,0,0,1
2004-01-01 02:00:00,2804.0,2004,40.0,26.0,0,0,0,1,1,0,0,0,0,1
2004-01-01 03:00:00,2729.0,2004,38.0,23.0,0,0,0,1,1,0,0,0,0,1
2004-01-01 04:00:00,2722.0,2004,37.0,21.0,0,0,0,1,1,0,0,0,0,1


In [356]:
from sklearn.ensemble import RandomForestRegressor

In [357]:
cols_to_transform = ['drybulb', 'dewpnt', 'year']
X_train, X_test, y_train, y_test = TrainTestSplit(CT1_lin, test_size = 0.15, scale = True, cols_to_transform=cols_to_transform, 
                                              include_test_scale=False)

In [358]:
X_train.head()

Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_afternoon,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_Fall,season_Spring,season_Summer,season_Winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2004-01-01 00:00:00,-1.580168,-0.949256,-0.660553,0,0,0,1,1,0,0,0,0,1
2004-01-01 01:00:00,-1.580168,-0.897289,-0.660553,0,0,0,1,1,0,0,0,0,1
2004-01-01 02:00:00,-1.580168,-0.585487,-0.660553,0,0,0,1,1,0,0,0,0,1
2004-01-01 03:00:00,-1.580168,-0.689421,-0.811779,0,0,0,1,1,0,0,0,0,1
2004-01-01 04:00:00,-1.580168,-0.741388,-0.912596,0,0,0,1,1,0,0,0,0,1


In [359]:
# Tuning Random forest
# n_estimators = number of trees in the forest
# max_features = max number of features considered for splitting a node

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(10, 200, 10, endpoint=True)]
max_features = ['auto', 'sqrt']
max_depth = list(range(1,6))
# Create the random grid
random_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth':max_depth}
print(random_grid)

{'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 2, 3, 4, 5]}


In [360]:
#import randomsearchcv
from sklearn.model_selection import RandomizedSearchCV

# First create the base model to tune
rf = RandomForestRegressor()

# Creating a time series split as discussed in the Introduction
tscv = TimeSeriesSplit(n_splits=5)
# Random search of parameters
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               cv = tscv, verbose=2, random_state = 42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

rf_random.best_params_
#rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 73, 'max_features': 'auto', 'max_depth': 5}

# 1. Creating our population

In [361]:
CT.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,Winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,Winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,Winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,Winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,Winter,night


In [362]:
pop = CT.drop(['zone', 'date', 'day_of_week', 'day_of_year', 'weekend', 'holiday', 'trend', 'time_of_day', 'season', 'demand'], axis=1)
#CTtest = CTtest.drop(['index'], axis=1)
pop.head()

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working


In [363]:
scaler = StandardScaler()
cols_to_transform = ['drybulb', 'dewpnt', 'year']
scalar = scaler.fit(pop[cols_to_transform])

## 1.1 Turning our pop into X_train format

In [364]:
pop_forDemandPred = pop.copy()
pop_forDemandPred['season'] = pop_forDemandPred['month'].apply(season_calc)
pop_forDemandPred['time_of_day'] = pop_forDemandPred['hour'].apply(time_of_day)
pop_forDemandPred.head()

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working,Winter,night
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working,Winter,night
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working,Winter,night
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working,Winter,night
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working,Winter,night


In [365]:
cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
pop_forDemandPred = pd.get_dummies(pop_forDemandPred[cols_use], drop_first = False)
print(pop_forDemandPred.shape)
pop_forDemandPred.head()

(113976, 13)


Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_afternoon,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_Fall,season_Spring,season_Summer,season_Winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2004-01-01 00:00:00,2004,33.0,26.0,0,0,0,1,1,0,0,0,0,1
2004-01-01 01:00:00,2004,34.0,26.0,0,0,0,1,1,0,0,0,0,1
2004-01-01 02:00:00,2004,40.0,26.0,0,0,0,1,1,0,0,0,0,1
2004-01-01 03:00:00,2004,38.0,23.0,0,0,0,1,1,0,0,0,0,1
2004-01-01 04:00:00,2004,37.0,21.0,0,0,0,1,1,0,0,0,0,1


In [366]:
pop_forDemandPred[cols_to_transform] = scaler.transform(pop_forDemandPred[cols_to_transform])
pop_forDemandPred.head()

Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_afternoon,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_Fall,season_Spring,season_Summer,season_Winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2004-01-01 00:00:00,-1.603278,-0.95769,-0.663635,0,0,0,1,1,0,0,0,0,1
2004-01-01 01:00:00,-1.603278,-0.905916,-0.663635,0,0,0,1,1,0,0,0,0,1
2004-01-01 02:00:00,-1.603278,-0.595267,-0.663635,0,0,0,1,1,0,0,0,0,1
2004-01-01 03:00:00,-1.603278,-0.698816,-0.814972,0,0,0,1,1,0,0,0,0,1
2004-01-01 04:00:00,-1.603278,-0.750591,-0.915863,0,0,0,1,1,0,0,0,0,1


In [367]:
rf_random.predict(pop_forDemandPred)

array([2957.85195678, 2957.85195678, 2830.00391639, ..., 4166.32757353,
       2957.85195678, 2957.85195678])

In [368]:
def predict_demand(df):
    output = df.copy()
    pop_forDemandPred = df.copy()

    pop_forDemandPred['season'] = pop_forDemandPred['month'].apply(season_calc)
    pop_forDemandPred['time_of_day'] = pop_forDemandPred['hour'].apply(time_of_day)

    cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
    cols_to_transform = ['drybulb', 'dewpnt', 'year']
    pop_forDemandPred = pd.get_dummies(pop_forDemandPred[cols_use], drop_first = False)
    pop_forDemandPred[cols_to_transform] = scaler.transform(pop_forDemandPred[cols_to_transform])

    predicted_values = rf_random.predict(pop_forDemandPred)
    output['demand'] = predicted_values

    return output

In [369]:
predict_demand(pop)

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working,demand
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working,2957.851957
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working,2957.851957
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working,2830.003916
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working,2957.851957
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working,2957.851957
...,...,...,...,...,...,...,...
2016-12-31 19:00:00,40.0,29.0,2016,12,19,non-working,4166.327574
2016-12-31 20:00:00,41.0,30.0,2016,12,20,non-working,4166.327574
2016-12-31 21:00:00,38.0,32.0,2016,12,21,non-working,4166.327574
2016-12-31 22:00:00,37.0,32.0,2016,12,22,non-working,2957.851957


# Selection

In [392]:
pop_small = pop.sample(1000)
print(pop_small.shape)
pop_small.reset_index(inplace=True, drop=True)
pop_small.head()

(1000, 6)


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,64.0,16.0,2016,4,18,working
1,55.0,48.0,2012,10,21,working
2,29.0,13.0,2013,11,23,non-working
3,50.0,37.0,2016,11,10,non-working
4,41.0,27.0,2006,3,22,non-working


In [371]:
popwithDemand = predict_demand(pop_small)
popwithDemand

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand
0,70.0,42.0,2004,4,18,non-working,3418.974145
1,-2.0,-12.0,2015,2,7,working,4401.033707
2,71.0,63.0,2014,8,0,working,3473.652407
3,59.0,57.0,2007,9,4,working,2689.515463
4,31.0,22.0,2008,12,11,working,4539.732755
...,...,...,...,...,...,...,...
995,47.0,41.0,2010,3,13,non-working,3394.754843
996,71.0,57.0,2014,6,11,non-working,3503.160881
997,22.0,6.0,2007,2,11,working,4574.874963
998,81.0,69.0,2006,7,11,working,5325.345588


In [372]:
def fitness(df):
    mu = df['demand'].mean()
    sd = df['demand'].std()

    Fitness = []
    for i in range(len(df)):
        z = ((df['demand'][i]) - mu) / sd
        Fitness.append(z)

    invFitness = []
    for i in range(len(Fitness)):
        x = Fitness[i] * (-1)
        invFitness.append(x)

    NormalFitness = []
    min = np.min(invFitness)
    max = np.max(invFitness)


    for i in range(len(invFitness)):
        n = (invFitness[i] - min) / (max - min)
        NormalFitness.append(n)

    df_Fitness = df.copy()
    df_Fitness['Fitness'] = NormalFitness

    return df_Fitness


In [373]:
eval = fitness(popwithDemand)
eval

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness
0,70.0,42.0,2004,4,18,non-working,3418.974145,0.785009
1,-2.0,-12.0,2015,2,7,working,4401.033707,0.495569
2,71.0,63.0,2014,8,0,working,3473.652407,0.768893
3,59.0,57.0,2007,9,4,working,2689.515463,1.000000
4,31.0,22.0,2008,12,11,working,4539.732755,0.454690
...,...,...,...,...,...,...,...,...
995,47.0,41.0,2010,3,13,non-working,3394.754843,0.792147
996,71.0,57.0,2014,6,11,non-working,3503.160881,0.760196
997,22.0,6.0,2007,2,11,working,4574.874963,0.444333
998,81.0,69.0,2006,7,11,working,5325.345588,0.223149


In [374]:
def roulette_selection(df):
    
    F = df['Fitness'].sum()
    df['SelectionProb'] = df['Fitness'] / F
    df['CumulativeProb'] = df['SelectionProb'].cumsum()

    selectors = np.random.random_sample((len(df),))

    dict_copy = df.to_dict('records')
    selectedChromsIndexes = []

    i = 0
    for selector in selectors:
        for r in dict_copy:
            if (r['CumulativeProb']) > selector:
                selectedChromsIndexes.append(r)
                break

    selected = pd.DataFrame(selectedChromsIndexes)

    return selected



In [375]:
selected = roulette_selection(eval)
selected

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness,SelectionProb,CumulativeProb
0,61.0,58.0,2011,10,11,non-working,3394.754843,0.792147,0.001117,0.018817
1,58.0,48.0,2015,4,23,working,2689.515463,1.000000,0.001410,0.151085
2,65.0,62.0,2014,10,1,working,2978.972567,0.914689,0.001290,0.440163
3,56.0,52.0,2004,4,15,working,3919.301461,0.637548,0.000899,0.025846
4,36.0,18.0,2006,3,2,working,2953.299124,0.922256,0.001301,0.307054
...,...,...,...,...,...,...,...,...,...,...
995,42.0,42.0,2015,12,5,working,2694.068296,0.998658,0.001408,0.260299
996,79.0,57.0,2006,8,15,working,4799.488732,0.378133,0.000533,0.693424
997,63.0,56.0,2007,8,22,working,2700.396755,0.996793,0.001406,0.288093
998,55.0,54.0,2006,11,4,working,2689.515463,1.000000,0.001410,0.867719


# Cross

In [376]:
select = selected.copy()
select['ProbCros'] = np.random.random(len(select))
select['Cross'] = select.apply(lambda x: 1 if (x['ProbCros'] < 0.25) else 0, axis=1)

In [377]:
select.head()

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness,SelectionProb,CumulativeProb,ProbCros,Cross
0,61.0,58.0,2011,10,11,non-working,3394.754843,0.792147,0.001117,0.018817,0.690791,0
1,58.0,48.0,2015,4,23,working,2689.515463,1.0,0.00141,0.151085,0.738119,0
2,65.0,62.0,2014,10,1,working,2978.972567,0.914689,0.00129,0.440163,0.307835,0
3,56.0,52.0,2004,4,15,working,3919.301461,0.637548,0.000899,0.025846,0.432446,0
4,36.0,18.0,2006,3,2,working,2953.299124,0.922256,0.001301,0.307054,0.595262,0


In [378]:
import random

In [379]:
cross_point = random.randint(1,5)
cross_point

3

In [380]:
def sp_crossover(chrom1, chrom2):
    cross_point = random.randint(1,5)
    c1_l = chrom1[0:cross_point]
    c1_r = chrom1[cross_point:len(chrom1)]

    c2_l = chrom2[0:cross_point]
    c2_r = chrom2[cross_point:len(chrom2)]

    child1 = np.concatenate((c1_l, c2_r))
    child2 = np.concatenate((c2_l, c1_r))

    return child1, child2

In [381]:
def cross(df, crossProb = 0.25):
    select = df.copy()
    select['ProbCros'] = np.random.random(len(select))
    select['Cross'] = select.apply(lambda x: 1 if (x['ProbCros'] < crossProb) else 0, axis=1)
    
    cross = select.loc[select['Cross'] == 1]
    cross = cross.iloc[:, 0:6]
    cross['group'] = 1
    cross['group'].iloc[(int(len(cross) / 2)):] = 2

    if len(cross[cross['group'] == 1]) != len(cross[cross['group'] == 2]):
        cross = cross.iloc[:-1 , :]
    cross.drop(['group'], inplace=True, axis=1)

    group1 = cross.iloc[:(int(len(cross) / 2))].to_numpy()
    group2 = cross.iloc[(int(len(cross) / 2)):].to_numpy()

    crossedGen = []
    for i, j in zip(group1, group2):
        children = sp_crossover(i, j)
        crossedGen.append(children[0])
        crossedGen.append(children[1])

    cross['children'] = crossedGen
    cross[:] = cross.pop('children').to_list()

    for i, j in cross.iterrows():
        select.iloc[i, 0:6] = j

    return (select.iloc[:, :6])

In [382]:
crossed = cross(selected)
print('Are the selected and crossed the same?:', selected.equals(crossed))
crossed

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Are the selected and crossed the same?: False


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,61.0,58.0,2011,10,11,non-working
1,58.0,48.0,2015,4,23,working
2,65.0,62.0,2014,10,1,working
3,56.0,52.0,2004,4,15,working
4,36.0,18.0,2006,3,2,working
...,...,...,...,...,...,...
995,42.0,42.0,2015,12,5,working
996,79.0,57.0,2006,8,15,working
997,63.0,56.0,2007,8,22,working
998,55.0,54.0,2006,11,4,working


# Mutation

In [383]:
# non-working Mutation
nonWorkingMutated = []
for i in crossed['non_working'].values:
    print(i)
    if (np.random.random()) < 0.01:
        if i == 'working':
            x = 'non-working'
            nonWorkingMutated.append(x)
        else:
            x = 'working'
            nonWorkingMutated.append(x)
    else:
        k = i
        nonWorkingMutated.append(k)
    break

nonWorkingMutated


non-working


['non-working']

In [384]:
def mutate(df, mutation_rate=0.01):
    newTEST = df.copy()

    drybulbMutated = []
    for i in newTEST['drybulb'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['drybulb'].min()), (df['drybulb'].max() +1))
            drybulbMutated.append(x)
        else:
            k = i
            drybulbMutated.append(k)

    newTEST['drybulb'] = drybulbMutated

    #dewpnt Mutation
    dewpntMutated = []
    for i in newTEST['dewpnt'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['dewpnt'].min()), (df['dewpnt'].max() +1))
            dewpntMutated.append(x)
        else:
            k = i
            dewpntMutated.append(k)

    newTEST['dewpnt'] = dewpntMutated

    # Year Mutation
    yearMutated = []
    for i in newTEST['year'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['year'].min()), (df['year'].max() +1))
            yearMutated.append(x)
        else:
            k = i
            yearMutated.append(k)

    newTEST['year'] = yearMutated

    # Month Mutation
    monthMutated = []
    for i in newTEST['month'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['month'].min()), (df['month'].max() +1))
            monthMutated.append(x)
        else:
            k = i
            monthMutated.append(k)

    newTEST['month'] = monthMutated

    # Hour Mutation
    hourMutated = []
    for i in newTEST['hour'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['hour'].min()), (df['hour'].max() +1))
            hourMutated.append(x)
        else:
            k = i
            hourMutated.append(k)

    newTEST['hour'] = hourMutated

    # non-working Mutation
    nonWorkingMutated = []
    for i in crossed['non_working'].values:
        if (np.random.random()) < mutation_rate:
            if i == 'working':
                x = 'non-working'
                nonWorkingMutated.append(x)
            else:
                x = 'working'
                nonWorkingMutated.append(x)
        else:
            k = i
            nonWorkingMutated.append(k)

    newTEST['non_working'] = nonWorkingMutated

    return newTEST


In [385]:
mutated = mutate(crossed)
print('Are the crossed and mutated the same?:', crossed.equals(mutated))
mutated

Are the crossed and mutated the same?: False


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,61.0,58.0,2011,10,11,non-working
1,58.0,48.0,2015,4,23,working
2,65.0,62.0,2014,10,1,working
3,56.0,52.0,2004,4,15,working
4,36.0,18.0,2006,3,2,working
...,...,...,...,...,...,...
995,42.0,42.0,2015,12,5,working
996,79.0,57.0,2006,8,15,working
997,63.0,56.0,2007,8,22,working
998,55.0,54.0,2006,11,4,working


# Putting all together

In [386]:
def GA(df, NumOfGenerations= 50):
    pop = df.copy()
    for i in range(NumOfGenerations):
        popwithDemand = predict_demand(pop)
        eval = fitness(popwithDemand)
        selected = roulette_selection(eval)
        crossed = cross(selected)
        mutated = mutate(crossed)
        pop = mutated.copy()

    return pop

In [None]:
FinalGen = GA(pop_small, NumOfGenerations=10)

In [393]:
pop_small

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,64.0,16.0,2016,4,18,working
1,55.0,48.0,2012,10,21,working
2,29.0,13.0,2013,11,23,non-working
3,50.0,37.0,2016,11,10,non-working
4,41.0,27.0,2006,3,22,non-working
...,...,...,...,...,...,...
995,49.0,29.0,2006,12,12,non-working
996,56.0,45.0,2015,4,1,non-working
997,23.0,9.0,2005,2,20,working
998,25.0,18.0,2013,1,2,non-working


In [402]:
# pop = pop_small.copy()
popwithDemand = predict_demand(pop)
eval = fitness(popwithDemand)
selected = roulette_selection(eval)
crossed = cross(selected)
mutated = mutate(crossed)
pop = mutated.copy()
pop

ValueError: X has 12 features, but DecisionTreeRegressor is expecting 13 features as input.

In [414]:
popfailure = pop.copy()
popfailure

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,season,time_of_day
0,50.0,40.0,2014,10,0,working,Fall,night
1,47.0,4.0,2007,2,4,working,Winter,night
2,50.0,25.0,2009,4,5,working,Spring,night
3,57.0,17.0,2012,11,23,working,Fall,night
4,34.0,49.0,2016,6,1,working,Summer,night
...,...,...,...,...,...,...,...,...
995,58.0,55.0,2013,12,23,working,Winter,night
996,50.0,25.0,2015,4,22,working,Spring,night
997,50.0,25.0,2015,7,22,non-working,Summer,night
998,58.0,52.0,2009,9,0,working,Fall,night


In [415]:


popfailure['season'] = popfailure['month'].apply(season_calc)
popfailure['time_of_day'] = popfailure['hour'].apply(time_of_day)

cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
cols_to_transform = ['drybulb', 'dewpnt', 'year']
popfailure = pd.get_dummies(popfailure[cols_use], drop_first = False)
popfailure[cols_to_transform] = scaler.transform(popfailure[cols_to_transform])
popfailure

# predicted_values = rf_random.predict(popfailure)
# predicted_values



Unnamed: 0,year,drybulb,dewpnt,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_Fall,season_Spring,season_Summer,season_Winter
0,1.068852,-0.077519,0.042603,0,0,1,0,1,1,0,0,0
1,-0.801639,-0.232843,-1.773439,0,0,1,0,1,0,0,0,1
2,-0.267213,-0.077519,-0.714081,0,0,1,0,1,0,1,0,0
3,0.534426,0.284905,-1.117646,0,0,1,0,1,1,0,0,0
4,1.603278,-0.905916,0.496614,0,0,1,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.801639,0.336680,0.799288,0,0,1,0,1,0,0,0,1
996,1.336065,-0.077519,-0.714081,0,0,1,0,1,0,1,0,0
997,1.336065,-0.077519,-0.714081,0,0,1,1,0,0,0,1,0
998,-0.267213,0.336680,0.647951,0,0,1,0,1,1,0,0,0


In [413]:
pop_small['season'] = pop_small['month'].apply(season_calc)
pop_small['time_of_day'] = pop_small['hour'].apply(time_of_day)

cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
cols_to_transform = ['drybulb', 'dewpnt', 'year']
pop_dummied = pd.get_dummies(pop_small[cols_use], drop_first = False)
pop_dummied[cols_to_transform] = scaler.transform(pop_dummied[cols_to_transform])
pop_dummied

Unnamed: 0,year,drybulb,dewpnt,time_of_day_afternoon,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_non-working,non_working_working,season_Fall,season_Spring,season_Summer,season_Winter
0,1.603278,0.647329,-1.168091,0,1,0,0,0,1,0,1,0,0
1,0.534426,0.181356,0.446168,0,1,0,0,0,1,1,0,0,0
2,0.801639,-1.164790,-1.319428,0,0,0,1,1,0,1,0,0,0
3,1.603278,-0.077519,-0.108734,0,0,1,0,1,0,1,0,0,0
4,-1.068852,-0.543492,-0.613190,0,0,0,1,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,-1.068852,-0.129293,-0.512298,0,0,1,0,1,0,0,0,0,1
996,1.336065,0.233130,0.294831,0,0,0,1,1,0,0,1,0,0
997,-1.336065,-1.475439,-1.521211,0,1,0,0,0,1,0,0,0,1
998,0.801639,-1.371889,-1.067200,0,0,0,1,1,0,0,0,0,1


In [405]:
predict_demand(popfailure)

ValueError: X has 12 features, but DecisionTreeRegressor is expecting 13 features as input.

In [389]:
FinalGen

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,59.0,53.0,2013,1,0,non-working
1,46.0,35.0,2016,3,23,working
2,35.0,53.0,2015,8,23,working
3,61.0,50.0,2010,3,10,working
4,60.0,22.0,2005,3,22,working
...,...,...,...,...,...,...
995,53.0,34.0,2014,5,3,working
996,60.0,35.0,2015,3,1,working
997,45.0,39.0,2005,5,3,working
998,66.0,43.0,2010,9,6,working


In [92]:
def TrainTestSplit(data, test_size = 0.15, scale = False, cols_to_transform=None, include_test_scale=False):
    
    df = data.copy()
    # get the index after which test set starts
    test_index = int(len(df)*(1-test_size))
    
    # StandardScaler fit on the entire dataset
    if scale and include_test_scale:
        scaler = StandardScaler()
        df[cols_to_transform] = scaler.fit_transform(df[cols_to_transform])
        
    X_train = df.drop('demand', axis = 1).iloc[:test_index]
    y_train = df.demand.iloc[:test_index]
    X_test = df.drop('demand', axis = 1).iloc[test_index:]
    y_test = df.demand.iloc[test_index:]
    
    # StandardScaler fit only on the training set
    if scale and not include_test_scale:
        scaler = StandardScaler()
        X_train[cols_to_transform] = scaler.fit_transform(X_train[cols_to_transform])
        X_test[cols_to_transform] = scaler.transform(X_test[cols_to_transform])
    
    return X_train, X_test, y_train, y_test

In [93]:
# Importing the dataset
df = pd.read_csv('./Data/cleandata/CleanedCT.csv', parse_dates=['ts'], index_col='ts')
df = df.drop(['Unnamed: 0'], axis=1)
# df['non_working'] = df.apply(lambda x: True if \
#                                          ((x['non_working'] == 'non-working'))
#                                          else False, axis = 1)
print('Data Frame Shape:', df.shape)
df.head()

Data Frame Shape: (113976, 14)


Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [94]:
df['day_of_week'] = df['day_of_week'].astype('category')
df['non_working'] = df['non_working'].astype('category')
df['month'] = df['month'].astype('category')
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [95]:

def season(month):
    if month in [6,7,8,9,10]:
        return "summer"
    else:
        return "winter"

In [96]:
df.reset_index(inplace=True)
df.head()

Unnamed: 0,ts,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working
0,2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working
1,2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working
2,2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working
3,2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working
4,2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working


In [97]:
df['season'] = df.ts.dt.month.apply(season)
df = df.set_index(['ts'])
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter


In [98]:
# Dividing the hours into 4 groups-> night, morning, afternoon, evening

hour_dict = {'morning': list(np.arange(7,13)),'afternoon': list(np.arange(13,16)), 'evening': list(np.arange(16,22)),
            'night': [22, 23, 0, 1, 2, 3, 4, 5, 6]}
hour_dict

{'morning': [7, 8, 9, 10, 11, 12],
 'afternoon': [13, 14, 15],
 'evening': [16, 17, 18, 19, 20, 21],
 'night': [22, 23, 0, 1, 2, 3, 4, 5, 6]}

In [99]:
def time_of_day(x):
    if x in hour_dict['morning']:
        return 'morning'
    elif x in hour_dict['afternoon']:
        return 'afternoon'
    elif x in hour_dict['evening']:
        return 'evening'
    else:
        return 'night'


In [100]:
df['time_of_day'] = df['hour'].apply(time_of_day)
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter,night


In [101]:
# creating categorical columns for linear regression 
cat_cols1 = ['month', 'day_of_year', 'hour', 'day_of_week', 'season', 'holiday', 'non_working', 'time_of_day']
#not including year above to capture the decreasing energy trend over increasing value of years
for col in cat_cols1:
    df[col] = df[col].astype('category')

In [102]:
df['year'] = df['year'].astype('int64')
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter,night


In [103]:
cols_use = ['demand', 'year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
CT1_lin = pd.get_dummies(df[cols_use], drop_first = True)
print(CT1_lin.shape)
CT1_lin.head()

(113976, 9)


Unnamed: 0_level_0,demand,year,drybulb,dewpnt,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_working,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2004-01-01 00:00:00,3126.0,2004,33.0,26.0,0,0,1,0,1
2004-01-01 01:00:00,2945.0,2004,34.0,26.0,0,0,1,0,1
2004-01-01 02:00:00,2804.0,2004,40.0,26.0,0,0,1,0,1
2004-01-01 03:00:00,2729.0,2004,38.0,23.0,0,0,1,0,1
2004-01-01 04:00:00,2722.0,2004,37.0,21.0,0,0,1,0,1


In [104]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

In [105]:
cols_to_transform = ['drybulb', 'dewpnt', 'year']
X_train, X_test, y_train, y_test = TrainTestSplit(CT1_lin, test_size = 0.15, scale = True, cols_to_transform=cols_to_transform, 
                                              include_test_scale=False)

In [106]:
X_train.head()

Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_working,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,-1.580168,-0.949256,-0.660553,0,0,1,0,1
2004-01-01 01:00:00,-1.580168,-0.897289,-0.660553,0,0,1,0,1
2004-01-01 02:00:00,-1.580168,-0.585487,-0.660553,0,0,1,0,1
2004-01-01 03:00:00,-1.580168,-0.689421,-0.811779,0,0,1,0,1
2004-01-01 04:00:00,-1.580168,-0.741388,-0.912596,0,0,1,0,1


In [107]:
# Tuning Random forest
# n_estimators = number of trees in the forest
# max_features = max number of features considered for splitting a node

# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(10, 200, 10, endpoint=True)]
max_features = ['auto', 'sqrt']
max_depth = list(range(1,6))
# Create the random grid
random_grid = {'n_estimators': n_estimators, 'max_features': max_features, 'max_depth':max_depth}
print(random_grid)

{'n_estimators': [10, 31, 52, 73, 94, 115, 136, 157, 178, 200], 'max_features': ['auto', 'sqrt'], 'max_depth': [1, 2, 3, 4, 5]}


In [108]:
#import randomsearchcv and time series split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit

# First create the base model to tune
rf = RandomForestRegressor()

# Creating a time series split as discussed in the Introduction
tscv = TimeSeriesSplit(n_splits=5)
# Random search of parameters
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, 
                               cv = tscv, verbose=2, random_state = 42, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

rf_random.best_params_
#rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


{'n_estimators': 73, 'max_features': 'auto', 'max_depth': 5}

# 1. Creating our population

In [109]:
df.head()

Unnamed: 0_level_0,zone,demand,drybulb,dewpnt,date,year,month,hour,day_of_week,day_of_year,weekend,holiday,trend,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2004-01-01 00:00:00,CT,3126.0,33.0,26.0,2004-01-01,2004,1,0,Thu,1,False,True,7344.0,non-working,winter,night
2004-01-01 01:00:00,CT,2945.0,34.0,26.0,2004-01-01,2004,1,1,Thu,1,False,True,7345.0,non-working,winter,night
2004-01-01 02:00:00,CT,2804.0,40.0,26.0,2004-01-01,2004,1,2,Thu,1,False,True,7346.0,non-working,winter,night
2004-01-01 03:00:00,CT,2729.0,38.0,23.0,2004-01-01,2004,1,3,Thu,1,False,True,7347.0,non-working,winter,night
2004-01-01 04:00:00,CT,2722.0,37.0,21.0,2004-01-01,2004,1,4,Thu,1,False,True,7348.0,non-working,winter,night


In [110]:
pop = df.drop(['zone', 'date', 'day_of_week', 'day_of_year', 'weekend', 'holiday', 'trend', 'time_of_day', 'season', 'demand'], axis=1)
#CTtest = CTtest.drop(['index'], axis=1)
pop.head()

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working


In [111]:
scaler = StandardScaler()
cols_to_transform = ['drybulb', 'dewpnt', 'year']
scalar = scaler.fit(pop[cols_to_transform])

## 1.1 Turning our pop into X_train format

In [112]:
pop_forDemandPred = pop.copy()
pop_forDemandPred['season'] = pop_forDemandPred['month'].apply(season)
pop_forDemandPred['time_of_day'] = pop_forDemandPred['hour'].apply(time_of_day)
pop_forDemandPred.head()

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working,season,time_of_day
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working,winter,night
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working,winter,night
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working,winter,night
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working,winter,night
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working,winter,night


In [113]:
cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
pop_forDemandPred = pd.get_dummies(pop_forDemandPred[cols_use], drop_first = True)
print(pop_forDemandPred.shape)
pop_forDemandPred.head()

(113976, 8)


Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_working,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,2004,33.0,26.0,0,0,1,0,1
2004-01-01 01:00:00,2004,34.0,26.0,0,0,1,0,1
2004-01-01 02:00:00,2004,40.0,26.0,0,0,1,0,1
2004-01-01 03:00:00,2004,38.0,23.0,0,0,1,0,1
2004-01-01 04:00:00,2004,37.0,21.0,0,0,1,0,1


In [114]:
pop_forDemandPred[cols_to_transform] = scaler.transform(pop_forDemandPred[cols_to_transform])
print(pop_forDemandPred.shape)
pop_forDemandPred.head()

(113976, 8)


Unnamed: 0_level_0,year,drybulb,dewpnt,time_of_day_evening,time_of_day_morning,time_of_day_night,non_working_working,season_winter
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2004-01-01 00:00:00,-1.603278,-0.95769,-0.663635,0,0,1,0,1
2004-01-01 01:00:00,-1.603278,-0.905916,-0.663635,0,0,1,0,1
2004-01-01 02:00:00,-1.603278,-0.595267,-0.663635,0,0,1,0,1
2004-01-01 03:00:00,-1.603278,-0.698816,-0.814972,0,0,1,0,1
2004-01-01 04:00:00,-1.603278,-0.750591,-0.915863,0,0,1,0,1


In [115]:
rf_random.predict(pop_forDemandPred)

array([2954.00559239, 2954.00559239, 2822.4524446 , ..., 3423.63915934,
       2945.24228601, 2945.24228601])

## 1.2 Creating Predict Demand function

In [116]:
def predict_demand(df):
    output = df.copy()
    pop_forDemandPred = df.copy()

    pop_forDemandPred['season'] = pop_forDemandPred['month'].apply(season)
    pop_forDemandPred['time_of_day'] = pop_forDemandPred['hour'].apply(time_of_day)

    cols_use = ['year', 'time_of_day', 'non_working', 'drybulb', 'dewpnt', 'season']
    cols_to_transform = ['drybulb', 'dewpnt', 'year']
    pop_forDemandPred = pd.get_dummies(pop_forDemandPred[cols_use], drop_first = True)
    pop_forDemandPred[cols_to_transform] = scaler.transform(pop_forDemandPred[cols_to_transform])

    predicted_values = rf_random.predict(pop_forDemandPred)
    output['demand'] = predicted_values

    return output

In [117]:
predict_demand(pop)

Unnamed: 0_level_0,drybulb,dewpnt,year,month,hour,non_working,demand
ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2004-01-01 00:00:00,33.0,26.0,2004,1,0,non-working,2954.005592
2004-01-01 01:00:00,34.0,26.0,2004,1,1,non-working,2954.005592
2004-01-01 02:00:00,40.0,26.0,2004,1,2,non-working,2822.452445
2004-01-01 03:00:00,38.0,23.0,2004,1,3,non-working,2954.005592
2004-01-01 04:00:00,37.0,21.0,2004,1,4,non-working,2954.005592
...,...,...,...,...,...,...,...
2016-12-31 19:00:00,40.0,29.0,2016,12,19,non-working,3423.639159
2016-12-31 20:00:00,41.0,30.0,2016,12,20,non-working,3423.639159
2016-12-31 21:00:00,38.0,32.0,2016,12,21,non-working,3423.639159
2016-12-31 22:00:00,37.0,32.0,2016,12,22,non-working,2945.242286


# Selection

In [118]:
pop_small = pop.sample(1000)
print(pop_small.shape)
pop_small.reset_index(inplace=True, drop=True)
pop_small.head()

(1000, 6)


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,70.0,65.0,2011,5,1,non-working
1,45.0,11.0,2006,3,16,working
2,42.0,32.0,2012,1,20,working
3,75.0,64.0,2014,9,13,working
4,64.0,58.0,2013,9,8,working


In [119]:
popwithDemand = predict_demand(pop_small)
popwithDemand

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand
0,70.0,65.0,2011,5,1,non-working,3270.199799
1,45.0,11.0,2006,3,16,working,3914.744067
2,42.0,32.0,2012,1,20,working,3889.117898
3,75.0,64.0,2014,9,13,working,4403.392502
4,64.0,58.0,2013,9,8,working,3889.117898
...,...,...,...,...,...,...,...
995,47.0,35.0,2013,10,1,working,2686.745057
996,78.0,47.0,2014,7,16,working,4566.458315
997,58.0,51.0,2010,7,7,working,3889.117898
998,29.0,24.0,2008,2,7,working,4542.130701


In [120]:
def fitness(df):
    mu = df['demand'].mean()
    sd = df['demand'].std()

    Fitness = []
    for i in range(len(df)):
        z = ((df['demand'][i]) - mu) / sd
        Fitness.append(z)

    invFitness = []
    for i in range(len(Fitness)):
        x = Fitness[i] * (-1)
        invFitness.append(x)

    NormalFitness = []
    min = np.min(invFitness)
    max = np.max(invFitness)


    for i in range(len(invFitness)):
        n = (invFitness[i] - min) / (max - min)
        NormalFitness.append(n)

    df_Fitness = df.copy()
    df_Fitness['Fitness'] = NormalFitness

    return df_Fitness


In [121]:
eval = fitness(popwithDemand)
eval

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness
0,70.0,65.0,2011,5,1,non-working,3270.199799,0.827741
1,45.0,11.0,2006,3,16,working,3914.744067,0.637447
2,42.0,32.0,2012,1,20,working,3889.117898,0.645013
3,75.0,64.0,2014,9,13,working,4403.392502,0.493179
4,64.0,58.0,2013,9,8,working,3889.117898,0.645013
...,...,...,...,...,...,...,...,...
995,47.0,35.0,2013,10,1,working,2686.745057,1.000000
996,78.0,47.0,2014,7,16,working,4566.458315,0.445035
997,58.0,51.0,2010,7,7,working,3889.117898,0.645013
998,29.0,24.0,2008,2,7,working,4542.130701,0.452218


In [122]:
def roulette_selection(df):
    
    F = df['Fitness'].sum()
    df['SelectionProb'] = df['Fitness'] / F
    df['CumulativeProb'] = df['SelectionProb'].cumsum()

    selectors = np.random.random_sample((len(df),))

    dict_copy = df.to_dict('records')
    selectedChromsIndexes = []

    i = 0
    for selector in selectors:
        for r in dict_copy:
            if (r['CumulativeProb']) > selector:
                selectedChromsIndexes.append(r)
                break

    selected = pd.DataFrame(selectedChromsIndexes)

    return selected

In [123]:
selected = roulette_selection(eval)
selected

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness,SelectionProb,CumulativeProb
0,34.0,12.0,2005,2,12,working,4127.606037,0.574602,0.000828,0.162499
1,53.0,52.0,2004,8,2,working,2695.508363,0.997413,0.001437,0.377177
2,52.0,21.0,2004,2,14,non-working,3423.639159,0.782440,0.001127,0.659026
3,51.0,49.0,2006,3,9,working,3914.744067,0.637447,0.000918,0.715922
4,75.0,62.0,2005,7,22,non-working,4100.726994,0.582537,0.000839,0.480601
...,...,...,...,...,...,...,...,...,...,...
995,84.0,57.0,2011,7,17,non-working,4124.207057,0.575605,0.000829,0.839844
996,36.0,20.0,2010,3,8,working,3889.117898,0.645013,0.000929,0.222851
997,24.0,16.0,2011,2,7,non-working,3805.506540,0.669698,0.000965,0.015730
998,34.0,7.0,2008,1,15,working,4127.606037,0.574602,0.000828,0.244788


# Cross

In [124]:
select = selected.copy()
select['ProbCros'] = np.random.random(len(select))
select['Cross'] = select.apply(lambda x: 1 if (x['ProbCros'] < 0.25) else 0, axis=1)

In [125]:
select.head()

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working,demand,Fitness,SelectionProb,CumulativeProb,ProbCros,Cross
0,34.0,12.0,2005,2,12,working,4127.606037,0.574602,0.000828,0.162499,0.041562,1
1,53.0,52.0,2004,8,2,working,2695.508363,0.997413,0.001437,0.377177,0.306934,0
2,52.0,21.0,2004,2,14,non-working,3423.639159,0.78244,0.001127,0.659026,0.855736,0
3,51.0,49.0,2006,3,9,working,3914.744067,0.637447,0.000918,0.715922,0.104726,1
4,75.0,62.0,2005,7,22,non-working,4100.726994,0.582537,0.000839,0.480601,0.603557,0


In [126]:
import random

In [127]:
def sp_crossover(chrom1, chrom2):
    cross_point = random.randint(1,5)
    c1_l = chrom1[0:cross_point]
    c1_r = chrom1[cross_point:len(chrom1)]

    c2_l = chrom2[0:cross_point]
    c2_r = chrom2[cross_point:len(chrom2)]

    child1 = np.concatenate((c1_l, c2_r))
    child2 = np.concatenate((c2_l, c1_r))

    return child1, child2

In [128]:
def cross(df, crossProb = 0.25):
    select = df.copy()
    select['ProbCros'] = np.random.random(len(select))
    select['Cross'] = select.apply(lambda x: 1 if (x['ProbCros'] < crossProb) else 0, axis=1)
    
    cross = select.loc[select['Cross'] == 1]
    cross = cross.iloc[:, 0:6]
    cross['group'] = 1
    cross['group'].iloc[(int(len(cross) / 2)):] = 2

    if len(cross[cross['group'] == 1]) != len(cross[cross['group'] == 2]):
        cross = cross.iloc[:-1 , :]
    cross.drop(['group'], inplace=True, axis=1)

    group1 = cross.iloc[:(int(len(cross) / 2))].to_numpy()
    group2 = cross.iloc[(int(len(cross) / 2)):].to_numpy()

    crossedGen = []
    for i, j in zip(group1, group2):
        children = sp_crossover(i, j)
        crossedGen.append(children[0])
        crossedGen.append(children[1])

    cross['children'] = crossedGen
    cross[:] = cross.pop('children').to_list()

    for i, j in cross.iterrows():
        select.iloc[i, 0:6] = j

    return (select.iloc[:, :6])

In [129]:
crossed = cross(selected)
print('Are the selected and crossed the same?:', selected.equals(crossed))
crossed

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Are the selected and crossed the same?: False


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,34.0,12.0,2008,9,19,working
1,53.0,52.0,2004,8,2,working
2,65.0,53.0,2005,2,12,working
3,51.0,49.0,2006,3,9,working
4,75.0,62.0,2005,7,22,non-working
...,...,...,...,...,...,...
995,84.0,57.0,2011,7,17,non-working
996,36.0,20.0,2010,3,8,working
997,24.0,16.0,2011,2,7,non-working
998,34.0,7.0,2008,1,15,working


# Mutation

In [130]:
def mutate(df, mutation_rate=0.01):
    newTEST = df.copy()

    drybulbMutated = []
    for i in newTEST['drybulb'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['drybulb'].min()), (df['drybulb'].max() +1))
            drybulbMutated.append(x)
        else:
            k = i
            drybulbMutated.append(k)

    newTEST['drybulb'] = drybulbMutated

    #dewpnt Mutation
    dewpntMutated = []
    for i in newTEST['dewpnt'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['dewpnt'].min()), (df['dewpnt'].max() +1))
            dewpntMutated.append(x)
        else:
            k = i
            dewpntMutated.append(k)

    newTEST['dewpnt'] = dewpntMutated

    # Year Mutation
    yearMutated = []
    for i in newTEST['year'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['year'].min()), (df['year'].max() +1))
            yearMutated.append(x)
        else:
            k = i
            yearMutated.append(k)

    newTEST['year'] = yearMutated

    # Month Mutation
    monthMutated = []
    for i in newTEST['month'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['month'].min()), (df['month'].max() +1))
            monthMutated.append(x)
        else:
            k = i
            monthMutated.append(k)

    newTEST['month'] = monthMutated

    # Hour Mutation
    hourMutated = []
    for i in newTEST['hour'].values:
        if (np.random.random()) < mutation_rate:
            x = np.random.randint((df['hour'].min()), (df['hour'].max() +1))
            hourMutated.append(x)
        else:
            k = i
            hourMutated.append(k)

    newTEST['hour'] = hourMutated

    # non-working Mutation
    nonWorkingMutated = []
    for i in crossed['non_working'].values:
        if (np.random.random()) < mutation_rate:
            if i == 'working':
                x = 'non-working'
                nonWorkingMutated.append(x)
            else:
                x = 'working'
                nonWorkingMutated.append(x)
        else:
            k = i
            nonWorkingMutated.append(k)

    newTEST['non_working'] = nonWorkingMutated

    return newTEST


In [131]:
mutated = mutate(crossed)
print('Are the crossed and mutated the same?:', crossed.equals(mutated))
mutated

Are the crossed and mutated the same?: False


Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,34.0,12.0,2008,9,19,working
1,53.0,52.0,2004,8,2,working
2,65.0,53.0,2004,2,12,working
3,51.0,49.0,2006,3,9,working
4,75.0,62.0,2005,7,22,non-working
...,...,...,...,...,...,...
995,84.0,57.0,2011,7,17,non-working
996,36.0,20.0,2010,3,8,working
997,24.0,16.0,2011,2,7,non-working
998,34.0,7.0,2008,1,15,working


# Putting all together

In [132]:
def GA(df, NumOfGenerations= 50):
    pop = df.copy()
    for i in range(NumOfGenerations):
        popwithDemand = predict_demand(pop)
        eval = fitness(popwithDemand)
        selected = roulette_selection(eval)
        crossed = cross(selected)
        mutated = mutate(crossed)
        pop = mutated.copy()

    return pop

In [139]:
FinalGen = GA(pop_small, NumOfGenerations=50)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_

ValueError: X has 7 features, but DecisionTreeRegressor is expecting 8 features as input.

In [138]:
FinalGen

Unnamed: 0,drybulb,dewpnt,year,month,hour,non_working
0,45.0,31.0,2012,1,22,working
1,55.0,55.0,2016,4,0,working
2,52.0,45.0,2005,3,2,working
3,32.0,30.0,2015,8,1,working
4,52.0,51.0,2011,5,1,non-working
...,...,...,...,...,...,...
995,56.0,65.0,2007,1,3,non-working
996,55.0,38.0,2014,5,5,working
997,55.0,55.0,2009,5,23,non-working
998,62.0,38.0,2010,3,22,non-working
