# Continuous Offline Policy Evaluation

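This notebook tests continuous-action offline policy evaluation (OPE): it scores a library of saved PPO policies against logged rule-based control data using the kernelized inverse probability weighting estimator from `obp`.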

In [None]:
import numpy as np
import sys
import pandas as pd
import pickle
sys.path.insert(0, "../")
import torch
from ppo import PPO
import glob
from tqdm.notebook import tqdm
from scipy.stats import spearmanr
from obp.ope import (
    ContinuousOffPolicyEvaluation,
    KernelizedInverseProbabilityWeighting,
)
%matplotlib widget

In [None]:
# Load the inputs stored under the rule-based log directory. Replace
# <name_of_building> in both paths with the building's directory name.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open(
    "../data/rule_based_log_data/<name_of_building>/action_probs_all_data.pkl",
    "rb",
) as f:
    behavior_model = pickle.load(f)
log_data = pd.read_csv(
    "../data/rule_based_log_data/<name_of_building>/log_data.csv"
)

In [None]:
# Evaluation horizon: `ts_end` is the number of logged rows kept per evaluation.
num_days = 15  # Number of days to consider for evaluation
num_ts_per_day = 24 * 4  # presumably 4 timesteps per hour (15-minute data) - confirm
ts_end = num_days * num_ts_per_day
# Distinct zone identifiers found in the log, in order of first appearance.
zones = log_data["zone"].unique()

In [None]:
# Paths of the saved policy checkpoints, in sorted order.
# NOTE: without recursive=True, "**" in a glob pattern matches like "*", so
# only .pth files directly inside the library directory are picked up.
policy_list = sorted(glob.glob("../policy_library_20220705/**.pth"))
# Policy paths (same "../..." form as policy_list) to leave out of scoring
# and of the correlation analysis.
invalid_policies = []

In [None]:
def get_policy_scores(config, use_progress_bar=False):
    """Estimate the value of every valid policy in ``policy_list`` for each zone.

    For each zone, the zone's logged rows are ordered by timestep and
    truncated to the first ``ts_end`` rows. Those rows form the bandit
    feedback against which each policy's actions are scored with the
    kernelized inverse probability weighting estimator.

    Args:
        config (dict): Estimator settings. Must contain ``"kernel"`` and
            ``"bandwidth"``, which are forwarded to
            ``KernelizedInverseProbabilityWeighting``.
        use_progress_bar (bool): If True, wrap the per-zone policy loop in a
            ``tqdm`` progress bar.

    Returns:
        dict: ``{policy_path: {zone: estimated_value}}`` where the value is the
        ``"kernelized_ipw"`` entry returned by ``estimate_policy_values``.
        Policies listed in ``invalid_policies`` are absent.

    Raises:
        KeyError: If ``config`` lacks ``"kernel"`` or ``"bandwidth"``.
    """
    state_vars = ["outdoor_temp", "solar_irradiation", "time_hour",
                  "zone_humidity", "zone_temp", "zone_occupancy"]
    kernel = config["kernel"]
    bandwidth = config["bandwidth"]
    policy_scores = {}
    for zone in zones:
        zone_data = log_data[log_data["zone"] == zone].sort_values(by=["timestep"])
        # Truncate the zone's own rows. Slicing `log_data` here (as the code
        # previously did) discarded the zone filter and the timestep ordering,
        # so every zone was evaluated on the same rows.
        ope_data = zone_data[:ts_end]

        # One state vector per logged row, in `state_vars` column order.
        states = ope_data[state_vars].values.tolist()
        actions = ope_data["action"].values
        rewards = ope_data["reward"].values

        # NOTE(review): pscore is fixed to 1 for every sample rather than taken
        # from a behavior-policy model - confirm this is intended.
        ope = ContinuousOffPolicyEvaluation(
            bandit_feedback={
                "action": np.array(actions),
                "reward": np.array(rewards),
                "pscore": np.ones(len(ope_data)),
            },
            ope_estimators=[
                KernelizedInverseProbabilityWeighting(kernel=kernel, bandwidth=bandwidth)
            ],
        )

        policy_iterable = tqdm(policy_list) if use_progress_bar else policy_list
        for policy in policy_iterable:
            # Skip invalid policies before paying for agent construction/loading.
            if policy in invalid_policies:
                continue

            agent = PPO(6, 1, 0.003, 0.0005, 1, 10, 0.2,
                        has_continuous_action_space=True, action_std_init=0.2,
                        device=torch.device('cpu'), diverse_policies=list(),
                        diverse_weight=0, diverse_increase=True)
            agent.load(policy)
            agent.policy_evaluation = False
            agent.policy_old.set_action_std(0.1)

            # The sigmoid squashes the policy output into (0, 1) before it is
            # compared with the logged actions.
            eval_actions = torch.Tensor(agent.select_action(states)).sigmoid()
            estimated_value = ope.estimate_policy_values(
                action_by_evaluation_policy=eval_actions.numpy()
            )
            # Keep the first score recorded for a (policy, zone) pair.
            policy_scores.setdefault(policy, {}).setdefault(
                zone, estimated_value["kernelized_ipw"]
            )
    return policy_scores

In [None]:
def calculate_zonewise_spearman_corr(policy_scores):
    """Compute the per-zone Spearman correlation between OPE scores and energy.

    This needs the ground-truth evaluation results, so it can only be used
    when the true policy values are known. It is not required to run OPE.

    Args:
        policy_scores (dict): Nested mapping ``{policy_path: {zone: score}}``.
            It is looked up with keys of the form ``"../<policy>"``, where
            ``<policy>`` is the value in the evaluation file's policy column.

    Returns:
        dict: Maps each zone to the result object returned by
        ``scipy.stats.spearmanr``.
    """
    ground_truth = pd.read_csv(
        "../data/evaluation_clean_20220705.csv",
        header=None,
        names=["datetime", "policy", "zone", "energy"],
    )
    # Entries of invalid_policies carry a three-character prefix ("../") that
    # the evaluation file's policy column does not have.
    excluded = [path[3:] for path in invalid_policies]

    results = {}
    for zone in zones:
        zone_df = ground_truth[ground_truth["zone"] == zone]
        zone_df = zone_df[~zone_df["policy"].isin(excluded)]
        ranked = zone_df.sort_values(by=["energy"])
        ranked = ranked.assign(
            ope_scores=[policy_scores[f"../{name}"][zone] for name in ranked["policy"]]
        )
        results[zone] = spearmanr(
            ranked["energy"].values, ranked["ope_scores"].values, nan_policy="omit"
        )
    return results

In [None]:
def calculate_average_spearman_corr(config):
    """Return the mean absolute zone-wise Spearman correlation for ``config``.

    Args:
        config (dict): Estimator settings passed to ``get_policy_scores``.

    Returns:
        The mean, over zones, of the absolute value of each zone's
        Spearman correlation coefficient.
    """
    zonewise = calculate_zonewise_spearman_corr(get_policy_scores(config))
    # The sign is dropped: only the strength of the rank agreement is averaged.
    return np.mean([abs(result.correlation) for result in zonewise.values()])

In [None]:
# Score every policy with a Gaussian kernel and persist the raw scores.
config = dict(kernel="gaussian", bandwidth=0.02)
policy_scores = get_policy_scores(config)
# NOTE(review): this output path is relative to the working directory, unlike
# the "../data/..." input paths - confirm that is intended.
with open(
    "data/<name_of_building>/cont_ipw/raw_scores/cont_ipw_raw_scores_days.pkl",
    "wb+",
) as f:
    pickle.dump(policy_scores, f)