# Offline Policy Evaluation w/ minibatch

Testing offline policy evalation with zone-agnostic behavior policy model

In [1]:
import numpy as np
import sys
import matplotlib.pyplot as plt
import pandas as pd
import pickle
sys.path.insert(0, "../")
import torch
from ppo import PPO
import glob
from tqdm.notebook import tqdm
import json
from scipy.stats import spearmanr
import importlib
import traceback

%matplotlib widget

In [2]:
# log_data = pd.read_csv("../data/rule_based_log_data/flexlab/0_cleaned_log_flexlab.csv")
# with open("../data/rule_based_log_data/flexlab/action_probs_all_data.pkl", "rb") as f:
#     behavior_model = pickle.load(f)
# with open("../data/invalid_policy_list_20220705.json") as f:
#     invalid_policies = json.load(f)["invalid_policies"]
# # invalid_policies = []
log_data = pd.read_csv("../data/rule_based_log_data/15zone/0_cleaned_log.csv")
with open("../data/rule_based_log_data/15zone/action_probs_all_data.pkl", "rb") as f:
    behavior_model = pickle.load(f)
# with open("../data/invalid_policy_list_20220705.json") as f:
#     invalid_policies = json.load(f)["invalid_policies"]
invalid_policies = []

## 30 Day MiniBatch

In [3]:
num_ts_per_day = 4 * 24
num_days = 15
ts_end = num_ts_per_day * num_days
zones = log_data["zone"].unique()

## Loading all Policies

In [4]:
policy_list = sorted(list(glob.glob(f"../policy_library_20220820/**.pth")))

### 1. Inverse Probability Weighting

In [5]:
from ope.iw import InverseProbabilityWeighting
policy_scores = {}
for zone in zones:
    print(zone)
    ope_data = log_data[log_data["zone"] == zone].sort_values(by=["timestep"])
    ope_data = log_data[:ts_end]
    ipw = InverseProbabilityWeighting(ope_data, retain_grad_fn=False, univariate_action=True)
    for policy in tqdm(policy_list):
        agent = PPO(6, 1, 0.003, 0.0005, 1, 10, 0.2,
                    has_continuous_action_space=True, action_std_init=0.2, 
                    device=torch.device('cpu'), diverse_policies=list(),
                    diverse_weight=0, diverse_increase=True)
        agent.load(policy)
        agent.policy_evaluation = True
        agent.policy_old.set_action_std(0.1)
        if policy not in invalid_policies:
            score, _, _, _, _ = ipw.evaluate_policy(agent.select_action, behavior_model, score="mean")
        else:
            continue
        if policy not in policy_scores:
            policy_scores[policy] = {}
        if zone not in policy_scores[policy]:
            policy_scores[policy][zone] = score.item()

Perimeter_top_ZN_3


  0%|          | 0/870 [00:00<?, ?it/s]

  return np.log(value / (1- value))


Perimeter_bot_ZN_1


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_top_ZN_1


  0%|          | 0/870 [00:00<?, ?it/s]

Core_bottom


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_top_ZN_4


  0%|          | 0/870 [00:00<?, ?it/s]

Core_top


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_bot_ZN_2


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_bot_ZN_3


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_mid_ZN_4


  0%|          | 0/870 [00:00<?, ?it/s]

Core_mid


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_mid_ZN_2


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_mid_ZN_3


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_top_ZN_2


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_mid_ZN_1


  0%|          | 0/870 [00:00<?, ?it/s]

Perimeter_bot_ZN_4


  0%|          | 0/870 [00:00<?, ?it/s]

In [6]:
eval_data_loc = "../data/eval_data/evaluation_report_20220820.csv"
df = pd.read_csv(eval_data_loc, header=None, names=["datetime","policy","zone","energy"])
spearman_corr = {}
for zone in zones:
    eval_df = df[df["zone"] == zone]

    # invalid_policies = list(set(policy_list) - set(policy_scores.keys()))
    for i_policy in invalid_policies:
        eval_df = eval_df[eval_df["policy"]!=i_policy[3:]]
    eval_df = eval_df.sort_values(by=["energy"])
    score_list = []
    for i, row in eval_df.iterrows():
        score_list.append(policy_scores[f"../{row['policy']}"][zone])

    eval_df["ope_scores"] = score_list
    correlation = spearmanr(eval_df["energy"].values, eval_df["ope_scores"].values)
    spearman_corr[zone] = correlation

In [9]:
with open(f"data/15zone/ipw/raw_scores/ipw_raw_scores_{num_days}_days_18_09_2022.pkl", "wb+") as f:
    pickle.dump(policy_scores, f)

In [8]:
with open(f"data/15zone/ipw/spearman_corr/ipw_spearman_corr_{num_days}_days_18_09_2022.pkl", "wb+") as f:
    pickle.dump(spearman_corr, f)

### 2. Self Normalized Inverse Probability Weighting

In [10]:
from ope.sniw import SelfNormalizedInverseProbabilityWeighting
policy_scores_sniw = {}
for zone in zones:
    print(zone)
    ope_data = log_data[log_data["zone"] == zone].sort_values(by=["timestep"])
    ope_data = log_data[:ts_end]
    snipw = SelfNormalizedInverseProbabilityWeighting(ope_data)
    for policy in tqdm(policy_list):
        agent = PPO(6, 1, 0.003, 0.0005, 1, 10, 0.2,
                    has_continuous_action_space=True, action_std_init=0.2, 
                    device=torch.device('cpu'), diverse_policies=list(),
                    diverse_weight=0, diverse_increase=True)
        agent.load(policy)
        agent.policy_evaluation = True
        agent.policy_old.set_action_std(0.1)
        if policy not in invalid_policies:
            score = snipw.evaluate_policy(agent.select_action, behavior_model, score="mean")
        else:
            continue
        if policy not in policy_scores_sniw:
            policy_scores_sniw[policy] = {}
        if zone not in policy_scores_sniw[policy]:
            policy_scores_sniw[policy][zone] = score.item()

FlexLab-X3-ZoneB


  0%|          | 0/400 [00:00<?, ?it/s]

FlexLab-X3-ZoneA


  0%|          | 0/400 [00:00<?, ?it/s]

In [10]:
eval_data_loc = "../data/evaluation_clean_20220705.csv"
df = pd.read_csv(eval_data_loc, header=None, names=["datetime","policy","zone","energy"])
spearman_corr_sniw = {}
for zone in zones:
    eval_df = df[df["zone"] == zone]
    for i_policy in invalid_policies:
        eval_df = eval_df[eval_df["policy"]!=i_policy[3:]]
    eval_df = eval_df.sort_values(by=["energy"])
    score_list = []
    for i, row in eval_df.iterrows():
        score_list.append(policy_scores_sniw[f"../{row['policy']}"][zone])

    eval_df["ope_scores"] = score_list
    correlation = spearmanr(eval_df["energy"].values, eval_df["ope_scores"].values, nan_policy="omit")
    spearman_corr_sniw[zone] = correlation

In [11]:
spearman_corr_sniw

{'Perimeter_top_ZN_3': SpearmanrResult(correlation=-0.38242011301619144, pvalue=4.0002137784925087e-07),
 'Perimeter_bot_ZN_1': SpearmanrResult(correlation=-0.37303948025005756, pvalue=8.029194038090729e-07),
 'Perimeter_top_ZN_1': SpearmanrResult(correlation=-0.3511063556967383, pvalue=3.767279517528943e-06),
 'Core_bottom': SpearmanrResult(correlation=0.32323701815667455, pvalue=2.2882161595134347e-05),
 'Perimeter_top_ZN_4': SpearmanrResult(correlation=-0.14194002315097604, pvalue=0.06897085022040791),
 'Core_top': SpearmanrResult(correlation=0.3467507278919247, pvalue=5.052443577911819e-06),
 'Perimeter_bot_ZN_2': SpearmanrResult(correlation=-0.21709598009839076, pvalue=0.005095506175888897),
 'Perimeter_bot_ZN_3': SpearmanrResult(correlation=-0.5044605286441938, pvalue=4.901921756134198e-12),
 'Perimeter_mid_ZN_4': SpearmanrResult(correlation=0.20648615175543827, pvalue=0.0077923416083663126),
 'Core_mid': SpearmanrResult(correlation=0.3385899381275683, pvalue=8.654950124831173e-0

In [11]:
with open(f"data/flexlab/snipw/raw_scores/{num_days}_days_24_08_2022.pkl", "wb+") as f:
    pickle.dump(policy_scores_sniw, f)

In [13]:
with open(f"data/snipw_spearman_corr_{num_days}_days_19_07_2022_new_policies.pkl", "wb+") as f:
    pickle.dump(spearman_corr_sniw, f)

In [14]:
spearman_corr_sniw

{'Perimeter_top_ZN_3': SpearmanrResult(correlation=-0.38242011301619144, pvalue=4.0002137784925087e-07),
 'Perimeter_bot_ZN_1': SpearmanrResult(correlation=-0.37303948025005756, pvalue=8.029194038090729e-07),
 'Perimeter_top_ZN_1': SpearmanrResult(correlation=-0.3511063556967383, pvalue=3.767279517528943e-06),
 'Core_bottom': SpearmanrResult(correlation=0.32323701815667455, pvalue=2.2882161595134347e-05),
 'Perimeter_top_ZN_4': SpearmanrResult(correlation=-0.14194002315097604, pvalue=0.06897085022040791),
 'Core_top': SpearmanrResult(correlation=0.3467507278919247, pvalue=5.052443577911819e-06),
 'Perimeter_bot_ZN_2': SpearmanrResult(correlation=-0.21709598009839076, pvalue=0.005095506175888897),
 'Perimeter_bot_ZN_3': SpearmanrResult(correlation=-0.5044605286441938, pvalue=4.901921756134198e-12),
 'Perimeter_mid_ZN_4': SpearmanrResult(correlation=0.20648615175543827, pvalue=0.0077923416083663126),
 'Core_mid': SpearmanrResult(correlation=0.3385899381275683, pvalue=8.654950124831173e-0