# Discrete Offline Policy Evaluation

Testing standard discrete offline policy evaluation methods

In [None]:
import sys
import pandas as pd
import pickle
sys.path.insert(0, "../")
import torch
from ppo import PPO
import glob
from tqdm.notebook import tqdm
from scipy.stats import spearmanr

%matplotlib widget

In [None]:
# Policies listed here (by their path in the policy library) are skipped by
# the evaluation loops below; none are excluded by default.
invalid_policies = []

# Cleaned log data for the target building.
log_data = pd.read_csv("../data/rule_based_log_data/<name_of_building>/0_cleaned_log.csv")

# Pickled object handed to the OPE estimators as the behavior model.
with open("../data/rule_based_log_data/<name_of_building>/action_probs_all_data.pkl", "rb") as model_file:
    behavior_model = pickle.load(model_file)

In [None]:
# Size of the evaluation mini-batch, expressed in log rows.
num_days = 15
num_ts_per_day = 24 * 4  # NOTE(review): presumably 4 timesteps per hour - confirm
ts_end = num_days * num_ts_per_day  # number of leading rows kept for evaluation

# Distinct zone identifiers found in the log.
zones = log_data["zone"].unique()

In [None]:
# Reading policies from the policy library.
# glob.glob already returns a list, so it is passed straight to sorted().
# NOTE: without recursive=True, "**" behaves like "*", so only .pth files
# directly inside the library directory are matched.
policy_list = sorted(glob.glob("../policy_library_20220820/**.pth"))

### 1. Inverse Probability Weighting

In [None]:
from ope.iw import InverseProbabilityWeighting

# Maps policy path -> {zone -> IPW score (requested with score="mean")}.
policy_scores = {}
for zone in zones:
    print(zone)
    # Keep only this zone's rows in timestep order, then truncate to the
    # evaluation window. The slice must be taken from the filtered frame:
    # slicing `log_data` itself would discard both the zone filter and the
    # sort, so every zone would be scored on the same rows.
    zone_log = log_data[log_data["zone"] == zone].sort_values(by=["timestep"])
    ope_data = zone_log.iloc[:ts_end]
    ipw = InverseProbabilityWeighting(ope_data, retain_grad_fn=False, univariate_action=True)
    for policy in tqdm(policy_list):
        # Skip excluded policies before building and loading an agent for them.
        if policy in invalid_policies:
            continue
        agent = PPO(6, 1, 0.003, 0.0005, 1, 10, 0.2,
                    has_continuous_action_space=True, action_std_init=0.2,
                    device=torch.device('cpu'), diverse_policies=[],
                    diverse_weight=0, diverse_increase=True)
        agent.load(policy)
        agent.policy_evaluation = True
        agent.policy_old.set_action_std(0.1)
        # evaluate_policy returns five values here; only the score is used.
        score, _, _, _, _ = ipw.evaluate_policy(agent.select_action, behavior_model, score="mean")
        # Record the score once per (policy, zone); an existing entry is kept.
        policy_scores.setdefault(policy, {}).setdefault(zone, score.item())

#### Saving Raw Policy Scores

In [None]:
# Persist the per-policy, per-zone IPW scores; the file name records the
# number of days used for evaluation.
ipw_scores_path = f"data/<name_of_building>/ipw/raw_scores/ipw_raw_scores_{num_days}_days.pkl"
with open(ipw_scores_path, "wb+") as out_file:
    pickle.dump(policy_scores, out_file)

#### Generating and Saving Spearman Correlation (OPTIONAL)

This optional step measures how well the ranking method works by computing the Spearman rank correlation between the OPE scores and the ground-truth performance. It requires the ground-truth performance of all policies in the policy library on the new building, which can only be generated via a brute-force method.

In [None]:
# Ground-truth evaluation report (no header row): one row per policy and zone.
# NOTE(review): the SNIPW section reads this report from
# "../data/eval_data/evaluation_report_20220820.csv" - confirm which location is correct.
eval_data_loc = "../data/evaluation_report_20220820.csv"
df = pd.read_csv(eval_data_loc, header=None, names=["datetime", "policy", "zone", "energy"])

# `invalid_policies` holds library paths that start with "../", while the
# report stores policy names without that prefix (it is re-added for the
# score lookup below), so strip the first three characters before comparing.
excluded_policies = [i_policy[3:] for i_policy in invalid_policies]

# Maps zone -> result of spearmanr(ground-truth energy, OPE score).
spearman_corr = {}
for zone in zones:
    keep = (df["zone"] == zone) & ~df["policy"].isin(excluded_policies)
    eval_df = df[keep].sort_values(by=["energy"])

    # Look up each remaining policy's OPE score for this zone, in row order.
    eval_df["ope_scores"] = [
        policy_scores[f"../{policy_name}"][zone] for policy_name in eval_df["policy"]
    ]
    correlation = spearmanr(eval_df["energy"].values, eval_df["ope_scores"].values)
    spearman_corr[zone] = correlation

with open(f"data/<name_of_building>/ipw/spearman_corr/ipw_spearman_corr_{num_days}_days.pkl", "wb+") as f:
    pickle.dump(spearman_corr, f)

### 2. Self-Normalized Inverse Probability Weighting

In [None]:
from ope.sniw import SelfNormalizedInverseProbabilityWeighting

# Maps policy path -> {zone -> SNIPW score (requested with score="mean")}.
policy_scores_sniw = {}
for zone in zones:
    print(zone)
    # Keep only this zone's rows in timestep order, then truncate to the
    # evaluation window. The slice must be taken from the filtered frame:
    # slicing `log_data` itself would discard both the zone filter and the
    # sort, so every zone would be scored on the same rows.
    zone_log = log_data[log_data["zone"] == zone].sort_values(by=["timestep"])
    ope_data = zone_log.iloc[:ts_end]
    snipw = SelfNormalizedInverseProbabilityWeighting(ope_data)
    for policy in tqdm(policy_list):
        # Skip excluded policies before building and loading an agent for them.
        if policy in invalid_policies:
            continue
        agent = PPO(6, 1, 0.003, 0.0005, 1, 10, 0.2,
                    has_continuous_action_space=True, action_std_init=0.2,
                    device=torch.device('cpu'), diverse_policies=[],
                    diverse_weight=0, diverse_increase=True)
        agent.load(policy)
        agent.policy_evaluation = True
        agent.policy_old.set_action_std(0.1)
        score = snipw.evaluate_policy(agent.select_action, behavior_model, score="mean")
        # Record the score once per (policy, zone); an existing entry is kept.
        policy_scores_sniw.setdefault(policy, {}).setdefault(zone, score.item())

#### Saving Raw Policy Scores

In [None]:
# Persist the per-policy, per-zone SNIPW scores; the file name records the
# number of days used for evaluation.
snipw_scores_path = f"data/<name_of_building>/snipw/raw_scores/snipw_raw_scores_{num_days}_days.pkl"
with open(snipw_scores_path, "wb+") as out_file:
    pickle.dump(policy_scores_sniw, out_file)

#### Generating and Saving Spearman Correlation (OPTIONAL)

This optional step measures how well the ranking method works by computing the Spearman rank correlation between the OPE scores and the ground-truth performance. It requires the ground-truth performance of all policies in the policy library on the new building, which can only be generated via a brute-force method.

In [None]:
# Ground-truth evaluation report (no header row): one row per policy and zone.
# NOTE(review): the IPW section reads this report from
# "../data/evaluation_report_20220820.csv" - confirm which location is correct.
eval_data_loc = "../data/eval_data/evaluation_report_20220820.csv"
df = pd.read_csv(eval_data_loc, header=None, names=["datetime", "policy", "zone", "energy"])

# `invalid_policies` holds library paths that start with "../", while the
# report stores policy names without that prefix (it is re-added for the
# score lookup below), so strip the first three characters before comparing.
excluded_policies = [i_policy[3:] for i_policy in invalid_policies]

# Maps zone -> result of spearmanr(ground-truth energy, OPE score).
spearman_corr_sniw = {}
for zone in zones:
    keep = (df["zone"] == zone) & ~df["policy"].isin(excluded_policies)
    eval_df = df[keep].sort_values(by=["energy"])

    # Look up each remaining policy's OPE score for this zone, in row order.
    eval_df["ope_scores"] = [
        policy_scores_sniw[f"../{policy_name}"][zone] for policy_name in eval_df["policy"]
    ]
    # nan_policy="omit" drops NaN observations before computing the correlation.
    correlation = spearmanr(eval_df["energy"].values, eval_df["ope_scores"].values, nan_policy="omit")
    spearman_corr_sniw[zone] = correlation

# The output location follows the same <name_of_building> pattern as the other
# result files in this notebook, replacing a hard-coded building name and date.
with open(f"data/<name_of_building>/snipw/spearman_corr/snipw_spearman_corr_{num_days}_days.pkl", "wb+") as f:
    pickle.dump(spearman_corr_sniw, f)