In [1]:
import numpy as np
import sys
import matplotlib.pyplot as plt
import pandas as pd
import pickle
sys.path.insert(0, "../")
import torch
from ppo import PPO
import glob
from tqdm.notebook import tqdm
import json
from scipy.stats import spearmanr
import importlib
from obp.ope import (
    ContinuousOffPolicyEvaluation,
    KernelizedInverseProbabilityWeighting,
    KernelizedSelfNormalizedInverseProbabilityWeighting,\
)

%matplotlib widget

In [2]:
log_data = pd.read_csv("../data/rule_based_log_data/0_cleaned_log.csv")
with open("../data/rule_based_log_data/action_probs_all_data.pkl", "rb") as f:
    behavior_model = pickle.load(f)
with open("../data/invalid_policy_list.json") as f:
    invalid_policies = json.load(f)["invalid_policies"]

In [3]:
num_ts_per_day = 4 * 24
num_days = 30
ts_end = num_ts_per_day * num_days
zones = log_data["zone"].unique()

In [4]:
policy_list = sorted(list(glob.glob(f"../policy_library/**.pth")))

In [5]:
policy_scores = {}
for zone in zones:
    print(zone)
    ope_data = log_data[log_data["zone"] == zone].sort_values(by=["timestep"])
    ope_data = log_data[:ts_end]
    states = []
    actions = []
    rewards = []
    for i, row in ope_data.iterrows():
        state_vars = ["outdoor_temp", "solar_irradiation", "time_hour",
                      "zone_humidity", "zone_temp", "zone_occupancy"]
        state = [row[var] for var in state_vars]
        action = row["action"]
        reward = row["reward"]
        states.append(state)
        rewards.append(reward)
        actions.append(action)
        ope = ContinuousOffPolicyEvaluation(bandit_feedback=
                                            {"action": np.array(actions),
                                             "reward": np.array(rewards),
                                             "pscore": np.ones((len(ope_data)))},
                                            ope_estimators=[KernelizedSelfNormalizedInverseProbabilityWeighting(kernel="epanechnikov", bandwidth=0.02)])

    for policy in tqdm(policy_list):
        agent = PPO(6, 1, 0.003, 0.0005, 1, 10, 0.2,
                    has_continuous_action_space=True, action_std_init=0.2, 
                    device=torch.device('cpu'), diverse_policies=list(),
                    diverse_weight=0, diverse_increase=True)
        agent.load(policy)
        agent.policy_evaluation = False
        agent.policy_old.set_action_std(0.1)
        
        # probs = torch.exp(agent.buffer.logprobs[0].reshape(-1, 1))
        if policy not in invalid_policies:
            # score, _, _, _, _ = ipw.evaluate_policy(agent.select_action, behavior_model, score="mean")
            eval_actions = torch.Tensor(agent.select_action(states)).sigmoid()
            estimated_value = ope.estimate_policy_values(action_by_evaluation_policy=eval_actions.numpy())
            # print(estimated_value)
        else:
            continue
        if policy not in policy_scores:
            policy_scores[policy] = {}
        if zone not in policy_scores[policy]:
            policy_scores[policy][zone] = estimated_value["kernelized_snipw"]

Perimeter_top_ZN_3


  0%|          | 0/800 [00:00<?, ?it/s]

  estimated_rewards /= (kernel_func(u) / pscore).mean()


Perimeter_bot_ZN_1


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_top_ZN_1


  0%|          | 0/800 [00:00<?, ?it/s]

Core_bottom


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_top_ZN_4


  0%|          | 0/800 [00:00<?, ?it/s]

Core_top


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_bot_ZN_2


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_bot_ZN_3


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_mid_ZN_4


  0%|          | 0/800 [00:00<?, ?it/s]

Core_mid


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_mid_ZN_2


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_mid_ZN_3


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_top_ZN_2


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_mid_ZN_1


  0%|          | 0/800 [00:00<?, ?it/s]

Perimeter_bot_ZN_4


  0%|          | 0/800 [00:00<?, ?it/s]

In [6]:
eval_data_loc = "../data/1month_eval.csv"
df = pd.read_csv(eval_data_loc, header=None, names=["datetime","policy","zone","energy"])
spearman_corr = {}
for zone in zones:
    eval_df = df[df["zone"] == zone]

    # invalid_policies = list(set(policy_list) - set(policy_scores.keys()))
    for i_policy in invalid_policies:
        eval_df = eval_df[eval_df["policy"]!=i_policy[3:]]
    eval_df = eval_df.sort_values(by=["energy"])
    score_list = []
    for i, row in eval_df.iterrows():
        score_list.append(policy_scores[f"../{row['policy']}"][zone])

    eval_df["ope_scores"] = score_list
    correlation = spearmanr(eval_df["energy"].values, eval_df["ope_scores"].values, nan_policy="omit")
    spearman_corr[zone] = correlation
spearman_corr

{'Perimeter_top_ZN_3': SpearmanrResult(correlation=0.13244935692603602, pvalue=0.05082421253470562),
 'Perimeter_bot_ZN_1': SpearmanrResult(correlation=-0.10095176399956175, pvalue=0.13643284096914582),
 'Perimeter_top_ZN_1': SpearmanrResult(correlation=-0.033776800886004085, pvalue=0.6166774345856499),
 'Core_bottom': SpearmanrResult(correlation=0.3217096619569294, pvalue=1.6212967714298676e-06),
 'Perimeter_top_ZN_4': SpearmanrResult(correlation=0.23780066279442683, pvalue=0.0005099398806844889),
 'Core_top': SpearmanrResult(correlation=0.31311809129062573, pvalue=2.9870726863053575e-06),
 'Perimeter_bot_ZN_2': SpearmanrResult(correlation=0.20273302497007692, pvalue=0.0028885173389176567),
 'Perimeter_bot_ZN_3': SpearmanrResult(correlation=-0.09316377788136092, pvalue=0.167547077123327),
 'Perimeter_mid_ZN_4': SpearmanrResult(correlation=0.29438788268495697, pvalue=1.0856126728301105e-05),
 'Core_mid': SpearmanrResult(correlation=0.26143823260011084, pvalue=8.70427128807221e-05),
 'P