In [1]:
import pandas as pd
import statsmodels.formula.api as smf
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np
from datetime import timedelta
import random


# Display options: never truncate rows, and show up to 500 columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', 500)

# Seed both random number generators this notebook draws from (`random` and
# `np.random`) so the simulated data can be regenerated.
# Change SEED to draw a different dataset.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [2]:
# Load the raw input data; the relative path is resolved against the current
# working directory. Later cells read the "Energy" and "Point" columns from it.
df = pd.read_csv("more_data_action_energy_linear_extratrain_0.csv")

In [3]:
# Remove the "Unnamed: 0" column (presumably a saved index column — confirm).
# Reassigning with errors="ignore" keeps this cell re-runnable: the in-place
# drop raised KeyError once the column was already gone.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

In [4]:
# Keeps the leading fraction of the raw rows. The multiplier is currently 1, so
# every row is kept; use 0.1 instead to reduce the raw data to 10%.
df = df.iloc[:int(len(df) * 1)]

In [5]:
# Hourly offset for a 08:00-18:00 working window that is active on all seven
# days of the week (no weekend gap).
cbh = pd.offsets.CustomBusinessHour(
    weekmask='Mon Tue Wed Thu Fri Sat Sun',
    start='08:00',
    end='18:00',
)

In [6]:
# One synthetic timestamp per row, stepping by the `cbh` offset from
# 2018-09-20 08:00.
df["Timestamp"] = pd.date_range(
    start=pd.Timestamp("2018-09-20T08"),
    periods=len(df),
    freq=cbh,
)

In [7]:
# Calendar date (datetime.date objects) of each timestamp.
df["Date"] = df["Timestamp"].dt.date

In [8]:
# Hour of day of each timestamp.
df["Hour"] = df["Timestamp"].map(lambda stamp: stamp.hour)

In [9]:
# Weekday name ("Monday", "Tuesday", ...) of each row's date.
dates_as_datetime = pd.to_datetime(df["Date"])
df["Day of Week"] = dates_as_datetime.dt.day_name()

In [10]:
# include below line if you are using Jupyter Notebook
%matplotlib inline

# Set the default figure width to 12 and height to 16 (inches)
plt.rcParams['figure.figsize'] = [12, 16]

# Data Preparation

'Baseline' - calculated from the same hour on the same day of the week over the three previous weeks
'v_learning'? TODO: brainstorm a way around this pulling the model back toward the observed data; the office might not react the way vicarious learning assumes during training.
'weekly_poll' - a score from 0 to 5 that has a noisy, decreasing effect on energy usage throughout the day (1 = least likely to save energy, 5 = most likely)
'pre_survey' - a score from 0 to 5 that has a noisy, decreasing effect on energy usage throughout the day (1 = least likely to save energy, 5 = most likely)
'points' - done
'email_indicator' - 0 or 1 (a value of 1 reduces energy usage; modify the energy values when the indicator is 1)
'out_of_office' - 0 or 1 (a value of 1 reduces energy usage; modify the energy values when the indicator is 1)
'Timestamp' - done

## Out Of Office

In [11]:
# Unique dates in random order. The dates are sorted before shuffling because
# set iteration order is not guaranteed to be the same from one kernel to the
# next; starting from a fixed order makes the shuffle depend only on the state
# of the `random` generator.
df_dates_shuffled = sorted(set(df["Date"]))
random.shuffle(df_dates_shuffled)

In [12]:
# Keep the first 20 shuffled dates; these become the out-of-office days.
N_OUT_OF_OFFICE_DAYS = 20
df_dates_shuffled = df_dates_shuffled[:N_OUT_OF_OFFICE_DAYS]

In [13]:
# 1 for every row whose date is one of the selected out-of-office dates, else 0.
df["Out_of_Office"] = df["Date"].isin(df_dates_shuffled).astype("int64")

In [14]:
# On out-of-office rows the energy reading is replaced by a draw from
# Normal(mean=20, sd=1); every other row keeps its original value.
away = df["Out_of_Office"]
away_energy = np.random.normal(20, 1, len(df))
df["Energy"] = df["Energy"] * (1 - away) + away_energy * away

## Presurvey

In [15]:
# Two pre-survey scores, each drawn uniformly from [0, 5] and rounded to two
# decimals.
rand_1 = round(random.uniform(0, 5), 2)
rand_2 = round(random.uniform(0, 5), 2)

# Rows whose index label is past the midpoint get rand_1, the rest get rand_2.
# A single vectorized assignment replaces the row-by-row `df.loc` writes, which
# produced the same column but did one label lookup and write per row.
df["Presurvey"] = np.where(df.index > len(df) / 2, rand_1, rand_2)

In [16]:
# Subtract a Poisson-distributed saving per row, with rate 10 x pre-survey score.
presurvey_saving = np.random.poisson(df["Presurvey"] * 10, len(df))
df["Energy"] = df["Energy"] - presurvey_saving

## Weekly Survey

In [17]:
# Rows per day (presumably the ten hourly rows of the 08:00-18:00 window).
length_day = 10
# Rows per full week; replaces the hard-coded 70 so it follows length_day.
length_week = 7 * length_day
# Days from the first row's date through the following Sunday
# (weekday() is 0 for Monday), i.e. the length of the first, partial week.
init_offset = 7 - pd.to_datetime(df.loc[0, "Date"]).weekday()
weekly_survey = []

# One uniformly drawn score in [0, 5] (two decimals) per week, repeated for
# every row of that week.
first = [round(random.uniform(0, 5), 2)] * (length_day * init_offset)
weekly_survey.extend(first)

while len(weekly_survey) < (len(df) - length_week):
    weekly_survey.extend([round(random.uniform(0, 5), 2)] * length_week)

# Final (possibly partial) week; a non-positive count yields an empty list.
last = [round(random.uniform(0, 5), 2)] * (len(df) - len(weekly_survey))
weekly_survey.extend(last)

# Truncate so a frame shorter than the first partial week no longer raises a
# length-mismatch error on assignment.
df["Weekly_Survey"] = weekly_survey[:len(df)]

In [18]:
# Subtract a Poisson-distributed saving per row, with rate 5 x weekly survey score.
weekly_saving = np.random.poisson(df["Weekly_Survey"] * 5, len(df))
df["Energy"] = df["Energy"] - weekly_saving

## Baseline

In [19]:
# Hourly data used for the before-game baseline; parse its Date column into
# datetimes so it can be compared with timestamps and queried through `.dt`.
social_game_df = pd.read_csv("HourlyDataCleanedFinal.csv")
social_game_df["Date"] = pd.to_datetime(social_game_df["Date"])

In [20]:
def get_hourly_in_game_baseline(df, timestamp):
    """Average the energy recorded at the same hour one, two and three weeks earlier.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with "Date", "Hour" and "Energy" columns. "Date" is compared with
        ``timestamp.date()`` and "Hour" with ``timestamp.hour``.
    timestamp : pandas.Timestamp or datetime.datetime
        Moment for which the baseline is wanted.

    Returns
    -------
    float
        Mean of the readings that were found. Weeks with no matching row and
        NaN readings are ignored; ``nan`` is returned when nothing usable is
        found.
    """
    readings = []
    for weeks_back in range(1, 4):
        past = timestamp - timedelta(weeks=weeks_back)
        # Filter once per look-back; if several rows match, the first one wins.
        matches = df.loc[(df["Date"] == past.date()) & (df["Hour"] == past.hour), "Energy"]
        if not matches.empty:
            readings.append(matches.iloc[0])

    usable = [value for value in readings if not np.isnan(value)]
    if not usable:
        # Return NaN directly: np.nanmean on an all-NaN list emits a
        # RuntimeWarning for every row that has no history yet.
        return np.nan
    return np.mean(usable)

In [21]:
def get_hourly_before_game_baseline(social_game_df, timestamp):
    """Mean pre-game hourly energy for the weekday and hour of ``timestamp``.

    Rows dated on or before 2018-06-29 are averaged per (Date, Hour); rows at
    positions 1 through 504 of that table are kept, and the mean "HourlyEnergy"
    of those sharing ``timestamp``'s hour and weekday name is returned.

    Parameters
    ----------
    social_game_df : pandas.DataFrame
        Frame with a datetime "Date" column plus "Hour" and "HourlyEnergy".
    timestamp : pandas.Timestamp
        Moment whose weekday name and hour select the baseline slot.

    Returns
    -------
    float
        Mean of the matching rows; NaN when no row matches.
    """
    cutoff = pd.Timestamp('2018-06-29')
    pre_game = social_game_df[social_game_df["Date"] <= cutoff]
    hourly = pre_game.groupby(["Date", "Hour"])["HourlyEnergy"].mean().reset_index()
    hourly["Day of Week"] = hourly["Date"].dt.day_name()

    # NOTE(review): 504 rows = 21 x 24; presumably this trims the table to
    # three weeks of hourly data — confirm.
    hourly = hourly.iloc[1:505]

    same_slot = (hourly["Hour"] == timestamp.hour) & (hourly["Day of Week"] == timestamp.day_name())
    return np.mean(hourly.loc[same_slot, "HourlyEnergy"])

In [22]:
# get_hourly_before_game_baseline only reads the weekday name and the hour of
# the timestamp it is given, so its result is computed once per
# (weekday, hour) slot and reused, instead of repeating the filter and groupby
# for every row.
baseline_by_slot = {}
before_game_baseline = []
for stamp in df["Timestamp"]:
    slot = (stamp.day_name(), stamp.hour)
    if slot not in baseline_by_slot:
        baseline_by_slot[slot] = get_hourly_before_game_baseline(social_game_df, stamp)
    before_game_baseline.append(baseline_by_slot[slot])

df["Before_Game_Baseline"] = before_game_baseline

In [23]:
# Add one Normal(mean=baseline / 10, sd=1) draw per row to the energy.
# NOTE(review): a NaN baseline makes that row's Energy NaN here.
before_game_noise = [
    np.random.normal(baseline / 10, 1, 1)[0]
    for baseline in df["Before_Game_Baseline"]
]
df["Energy"] = df["Energy"] + before_game_noise

In [None]:
# In-game baseline for every row, computed from the Energy values as they stand
# before this cell modifies them.
in_game_baseline = [
    get_hourly_in_game_baseline(df, stamp)
    for stamp in df["Timestamp"]
]
df["In_Game_Baseline"] = in_game_baseline

# Add one Normal(mean=baseline / 10, sd=1) draw per row; fill_value=0 turns a
# NaN on one side of the sum into 0.
in_game_noise = [
    np.random.normal(baseline / 10, 1, 1)[0]
    for baseline in df["In_Game_Baseline"]
]
df["Energy"] = df["Energy"].add(in_game_noise, fill_value=0)



## Final Tune

In [None]:
# Energy cannot be negative: raise any negative value to 0.
df["Energy"] = df["Energy"].clip(lower=0)

In [None]:
# Show the last five rows as a sanity check of the generated columns.
df.tail(5)

In [None]:
# Write the simulated data to CSV, with a header row and without the index.
# NOTE(review): this is an absolute, machine-specific path and the directory
# must already exist; consider a configurable, relative output directory.
output_path = r'/Users/akaash/Desktop/Dynamic_Model_Data/simulation_data_v2.csv'
df.to_csv(output_path, index=None, header=True)

# Graphing

In [None]:
# Split Energy into cycle and trend with a Hodrick-Prescott filter
# (smoothing parameter 50), then draw four stacked panels.
cycle, trend = sm.tsa.filters.hpfilter(df["Energy"], 50)

fig, ax = plt.subplots(4, 1)
panels = [
    ('Energy', df["Energy"]),
    ('Point', df["Point"]),
    ('Trend', trend),
    ('Cycle', cycle),
]
for axis, (title, series) in zip(ax, panels):
    axis.plot(series)
    axis.set_title(title)
plt.show()

# Prediction

In [None]:
# Chronological split: the first 70% of rows train the model, the rest test it.
split_at = int(len(df) * 0.7)
df_train, df_test = df.iloc[:split_at], df.iloc[split_at:]

In [None]:
# Ordinary least squares of Energy on the simulated drivers, fitted on the
# training rows only.
energy_formula = "(Energy ~ Point + Out_of_Office + Presurvey + Weekly_Survey + Before_Game_Baseline + In_Game_Baseline)"
ols_mod_train = smf.ols(formula=energy_formula, data=df_train)
ols_res = ols_mod_train.fit()
ols_res.summary()

In [None]:
# Predict Energy for the held-out rows with the fitted model.
prediction = ols_res.predict(exog=df_test)

In [None]:
# Predicted energy cannot be negative: raise any negative prediction to 0.
prediction = np.maximum(prediction, 0)

In [None]:
%matplotlib inline
plt.plot(prediction)
plt.plot(df_test["Energy"])

In [None]:
# Root-mean-square error of the predictions on the test rows.
residuals = prediction - df_test["Energy"]
np.sqrt(np.mean(residuals ** 2))