In [1]:
import warnings
# Ignore every warning for the rest of the session. Because no category or
# module is given, this also hides deprecation notices raised by the
# libraries imported in the following cells.
warnings.filterwarnings('ignore')

In [16]:
import pandas as pd
import numpy as np
import operator

from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn import cross_validation, metrics
from sklearn.grid_search import GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline

import calendar as cal
from IPython.core.display import HTML

import datetime

In [3]:
# Stand-alone MinMaxScaler instance. None of the cells shown in this notebook
# read `min_max`; the neural-network pipeline constructs its own MinMaxScaler.
min_max = MinMaxScaler()

In [4]:
# Remote location of the dataset CSV.
DATA_URL = "https://s3.us-east-2.amazonaws.com/ads-demo1/E_Dataset.csv"

# Read the CSV and parse the `date` column into datetimes so that the `.dt`
# accessor can be used on it afterwards.
df = pd.read_csv(DATA_URL, parse_dates=['date'])

In [5]:
# Calendar features derived from the parsed `date` timestamp column.
stamp = df['date']

df['year'] = stamp.dt.year
df['quarter'] = stamp.dt.quarter
df['month'] = stamp.dt.month

# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0.
# Use `isocalendar().week` where it exists and fall back to the old accessor
# on older pandas. The cast keeps a plain int64 column instead of the nullable
# UInt32 that `isocalendar()` returns.
if hasattr(stamp.dt, 'isocalendar'):
    df['weekNumberInYear'] = stamp.dt.isocalendar().week.astype('int64')
else:
    df['weekNumberInYear'] = stamp.dt.week

df['dayOfMonth'] = stamp.dt.day
# dayofweek runs 0 (Monday) .. 6 (Sunday), which indexes calendar.day_name directly.
df['dayInWeek'] = stamp.dt.dayofweek.apply(lambda x : cal.day_name[x])
df['hourOfDay'] = stamp.dt.hour
df['minute'] = stamp.dt.minute

# 1 for Saturday/Sunday rows, 0 otherwise (vectorised membership test).
df['WeekendFlag'] = df['dayInWeek'].isin(['Saturday', 'Sunday']).astype('int64')
# Seconds elapsed since 00:00:00 of the row's own day.
df['TotalSecondsMidnight'] = (stamp.dt.hour * 3600) + (stamp.dt.minute * 60) + (stamp.dt.second)

# One-hot encode the day name and the weekend flag as int64 indicator columns.
dayInWeek = pd.get_dummies(df.dayInWeek,prefix='dayInWeek').astype('int64')
weekendflag = pd.get_dummies(df.WeekendFlag,prefix='weekendflag').astype('int64')

# NOTE: this appends to `df`, so re-running the cell without reloading the CSV
# would append the indicator columns a second time.
df = pd.concat([df,dayInWeek,weekendflag],axis=1)

In [6]:
# Render the first five rows as an HTML table in the cell output.
preview_html = df.head().to_html()
display(HTML(preview_html))

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,T6,RH_6,T7,RH_7,T8,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2,year,quarter,month,weekNumberInYear,dayOfMonth,dayInWeek,hourOfDay,minute,WeekendFlag,TotalSecondsMidnight,dayInWeek_Friday,dayInWeek_Monday,dayInWeek_Saturday,dayInWeek_Sunday,dayInWeek_Thursday,dayInWeek_Tuesday,dayInWeek_Wednesday,weekendflag_0,weekendflag_1
0,2016-01-11 17:00:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,7.026667,84.256667,17.2,41.626667,18.2,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433,2016,1,1,2,11,Monday,17,0,0,61200,0,1,0,0,0,0,0,1,0
1,2016-01-11 17:10:00,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,6.833333,84.063333,17.2,41.56,18.2,48.863333,17.066667,45.56,6.483333,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195,2016,1,1,2,11,Monday,17,10,0,61800,0,1,0,0,0,0,0,1,0
2,2016-01-11 17:20:00,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,6.56,83.156667,17.2,41.433333,18.2,48.73,17.0,45.5,6.366667,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668,2016,1,1,2,11,Monday,17,20,0,62400,0,1,0,0,0,0,0,1,0
3,2016-01-11 17:30:00,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,6.433333,83.423333,17.133333,41.29,18.1,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.410389,45.410389,2016,1,1,2,11,Monday,17,30,0,63000,0,1,0,0,0,0,0,1,0
4,2016-01-11 17:40:00,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,6.366667,84.893333,17.2,41.23,18.1,48.59,17.0,45.4,6.133333,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097,2016,1,1,2,11,Monday,17,40,0,63600,0,1,0,0,0,0,0,1,0


In [7]:
# Columns kept for modelling, in the order they will appear in the frame.
# 'Appliances' comes first and is later split off as the target.
featureColumns = (
    ['Appliances']
    # time-of-day feature and the `lights` column
    + ['TotalSecondsMidnight', 'lights']
    # numbered T<i> / RH_<i> reading pairs, i = 1..9
    + ['T1', 'RH_1', 'T2', 'RH_2', 'T3', 'RH_3']
    + ['T4', 'RH_4', 'T5', 'RH_5', 'T6', 'RH_6']
    + ['T7', 'RH_7', 'T8', 'RH_8', 'T9', 'RH_9']
    # remaining un-numbered measurement columns
    + ['T_out', 'RH_out', 'Visibility', 'Windspeed', 'Tdewpoint', 'Press_mm_hg']
    # one-hot indicators built from WeekendFlag and dayInWeek
    + ['weekendflag_1', 'weekendflag_0']
    + ['dayInWeek_Monday', 'dayInWeek_Tuesday', 'dayInWeek_Wednesday', 'dayInWeek_Thursday']
    + ['dayInWeek_Friday', 'dayInWeek_Saturday', 'dayInWeek_Sunday']
)

In [8]:
# Keep only the columns listed in featureColumns, in that order. Everything
# else (for example `date` and the intermediate calendar fields that are not
# in the list) is dropped. `df` is rebound, so the wider frame is no longer
# reachable through this name afterwards.
df = df[featureColumns]

In [9]:
# Target vector: the `Appliances` column.
y = df['Appliances']
# Predictor matrix: every other column of the reduced frame.
X = df.drop('Appliances', axis=1)

In [10]:
# Hold out 25% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=101,
)

In [11]:
# Accumulates one row of train/test metrics per evaluated model.
error_metric = pd.DataFrame({'r2_train': [],
                             'r2_test': [],
                             'rms_train': [],
                             'rms_test': [],
                             'mae_train': [],
                             'mae_test': [],
                             'mape_train': [],
                             'mape_test': []})

# Test-set RMSE keyed by model name.
rmse_dict = {}


def calc_error_metric(modelname, model, X_train_scale, y_train, X_test_scale, y_test):
    """Score a fitted model on a train and a test set and record the results.

    Parameters
    ----------
    modelname : str
        Label written to the 'Model' column and used as the key in ``rmse_dict``.
    model : object
        Fitted estimator exposing ``predict(X)``.
    X_train_scale, X_test_scale : array-like
        Feature matrices handed to ``model.predict``.
    y_train, y_test : array-like
        True target values matching the two feature matrices.

    Returns
    -------
    pandas.DataFrame
        The module-level ``error_metric`` table with one new row appended.

    Side effects
    ------------
    Rebinds the global ``error_metric`` and sets ``rmse_dict[modelname]`` to
    the test RMSE.
    """
    global error_metric
    # Predict from the matrices that were passed in. The previous version read
    # the notebook globals X_train / X_test here and silently ignored the
    # X_train_scale / X_test_scale arguments.
    y_train_predicted = model.predict(X_train_scale)
    y_test_predicted = model.predict(X_test_scale)

    # MAE, RMS, MAPE, R2

    r2_train = r2_score(y_train, y_train_predicted)
    r2_test = r2_score(y_test, y_test_predicted)

    rms_train = sqrt(mean_squared_error(y_train, y_train_predicted))
    rms_test = sqrt(mean_squared_error(y_test, y_test_predicted))

    mae_train = mean_absolute_error(y_train, y_train_predicted)
    mae_test = mean_absolute_error(y_test, y_test_predicted)

    # Mean absolute percentage error. A zero among the true targets makes the
    # corresponding ratio infinite (or NaN), which propagates into the mean.
    mape_train = np.mean(np.abs((y_train - y_train_predicted) / y_train)) * 100
    mape_test = np.mean(np.abs((y_test - y_test_predicted) / y_test)) * 100

    rmse_dict[modelname] = rms_test

    df_local = pd.DataFrame({'Model': [modelname],
                             'r2_train': [r2_train],
                             'r2_test': [r2_test],
                             'rms_train': [rms_train],
                             'rms_test': [rms_test],
                             'mae_train': [mae_train],
                             'mae_test': [mae_test],
                             'mape_train': [mape_train],
                             'mape_test': [mape_test]})

    error_metric = pd.concat([error_metric, df_local])
    return error_metric

In [12]:
# Seed for the stochastic estimators below (same value as the train/test
# split) so that repeated runs produce the same metrics.
RANDOM_STATE = 101


def _fit_and_score(label, pipeline, param_grid):
    """Fit `pipeline` through a 5-fold GridSearchCV and record its metrics.

    The search is fitted on X_train / y_train, its train and test metrics are
    logged under `label` via calc_error_metric, and the fitted search object
    is returned.
    """
    search = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=5)
    search.fit(X_train, y_train)
    calc_error_metric(label, search, X_train, y_train, X_test, y_test)
    return search


# The StandardScaler step already standardises the inputs, so the former
# `normalize=True` on LinearRegression was redundant. That argument was also
# deprecated in scikit-learn 1.0 and removed in 1.2.
pipe_lr = Pipeline([('scl', StandardScaler()),
                    ('clf', LinearRegression())])
# An empty grid yields a single candidate: the pipeline with its own settings.
grid_params_lr = [{}]
gs_lr = _fit_and_score('Regression', pipe_lr, grid_params_lr)
print('Regression Pipeline Finished')


pipe_rf = Pipeline([('scl', StandardScaler()),
                    ('rf', RandomForestRegressor(n_estimators=30, max_depth=30,
                                                 random_state=RANDOM_STATE))])
grid_params_rf = [{}]
gs_rf = _fit_and_score('RandomForest', pipe_rf, grid_params_rf)
print('Random Forrest Pipeline Finished')


pipe_nn = Pipeline([('min/max scaler', MinMaxScaler()),
                    ('neural network', MLPRegressor(activation='relu',
                                                    alpha=0.05, learning_rate='constant', solver='adam',
                                                    random_state=RANDOM_STATE))])
grid_params_nn = [{}]
# The label spelling is kept as-is because it is written to the metrics table.
gs_nn = _fit_and_score('Nueral Network', pipe_nn, grid_params_nn)
print('Neural Network Pipeline Finished')

Regression Pipeline Finished
Random Forrest Pipeline Finished
Neural Network Pipeline Finished


In [13]:
# Name of the model with the smallest recorded test RMSE; on ties the
# earliest-inserted entry wins.
best_model = min(rmse_dict, key=rmse_dict.get)
print('Best Model is ', best_model)

Best Model is  RandomForest


In [15]:
# Write the metrics table to disk (index column included), then show it as
# the cell's output.
metrics_csv_path = 'Pipelining_Error_metrics.csv'
error_metric.to_csv(metrics_csv_path)
error_metric

Unnamed: 0,Model,mae_test,mae_train,mape_test,mape_train,r2_test,r2_train,rms_test,rms_train
0,Regression,53.660736,52.535485,62.577428,60.324042,0.178561,0.171841,93.161672,93.216941
0,RandomForest,32.938484,12.669975,32.731314,12.647357,0.531614,0.931181,70.347999,26.871533
0,Nueral Network,53.704585,52.378773,62.607551,60.426709,0.186706,0.18897,92.698647,92.247899
