In [1]:
import numpy as np
import sys
import matplotlib.pyplot as plt
import pandas as pd
import pickle
sys.path.insert(0, "../")
import torch
from ppo import PPO
import glob
from tqdm.notebook import tqdm
import json
from scipy.stats import spearmanr
import importlib
from obp.ope import (
    ContinuousOffPolicyEvaluation,
    KernelizedInverseProbabilityWeighting,
    KernelizedSelfNormalizedInverseProbabilityWeighting,\
)
import wandb

%matplotlib widget

In [2]:
log_data = pd.read_csv("../data/rule_based_log_data/0_cleaned_log.csv")
with open("../data/rule_based_log_data/action_probs_all_data.pkl", "rb") as f:
    behavior_model = pickle.load(f)
with open("../data/invalid_policy_list_20220705.json") as f:
    invalid_policies = json.load(f)["invalid_policies"]

In [3]:
num_ts_per_day = 4 * 24
num_days = 30
ts_end = num_ts_per_day * num_days
zones = log_data["zone"].unique()

In [4]:
policy_list = sorted(list(glob.glob(f"../policy_library_20220705/**.pth")))

In [13]:
def get_policy_scores(config, use_progress_bar=False):
    policy_scores = {}
    kernel = config["kernel"]
    bandwidth = config["bandwidth"]
    for zone in zones:
        # print(zone)
        ope_data = log_data[log_data["zone"] == zone].sort_values(by=["timestep"])
        ope_data = log_data[:ts_end]
        states = []
        actions = []
        rewards = []
        for i, row in ope_data.iterrows():
            state_vars = ["outdoor_temp", "solar_irradiation", "time_hour",
                          "zone_humidity", "zone_temp", "zone_occupancy"]
            state = [row[var] for var in state_vars]
            action = row["action"]
            reward = row["reward"]
            states.append(state)
            rewards.append(reward)
            actions.append(action)
        ope = ContinuousOffPolicyEvaluation(bandit_feedback=
                                            {"action": np.array(actions),
                                             "reward": np.array(rewards),
                                             "pscore": np.ones((len(ope_data)))},
                                            ope_estimators=[KernelizedInverseProbabilityWeighting(kernel=kernel, bandwidth=bandwidth)])
        
        if use_progress_bar:
            policy_iterable = tqdm(policy_list)
        else:
            policy_iterable = policy_list
        for policy in policy_iterable:
            agent = PPO(6, 1, 0.003, 0.0005, 1, 10, 0.2,
                        has_continuous_action_space=True, action_std_init=0.2, 
                        device=torch.device('cpu'), diverse_policies=list(),
                        diverse_weight=0, diverse_increase=True)
            agent.load(policy)
            agent.policy_evaluation = False
            agent.policy_old.set_action_std(0.1)

            # probs = torch.exp(agent.buffer.logprobs[0].reshape(-1, 1))
            if policy not in invalid_policies:
                # score, _, _, _, _ = ipw.evaluate_policy(agent.select_action, behavior_model, score="mean")
                eval_actions = torch.Tensor(agent.select_action(states)).sigmoid()
                estimated_value = ope.estimate_policy_values(action_by_evaluation_policy=eval_actions.numpy())
                # print(estimated_value)
            else:
                continue
            if policy not in policy_scores:
                policy_scores[policy] = {}
            if zone not in policy_scores[policy]:
                policy_scores[policy][zone] = estimated_value["kernelized_ipw"]
    return policy_scores

In [14]:
def calculate_zonewise_spearman_corr(policy_scores):
    eval_data_loc = "../data/evaluation_clean_20220705.csv"
    df = pd.read_csv(eval_data_loc, header=None, names=["datetime","policy","zone","energy"])
    spearman_corr = {}
    for zone in zones:
        eval_df = df[df["zone"] == zone]

        # invalid_policies = list(set(policy_list) - set(policy_scores.keys()))
        for i_policy in invalid_policies:
            eval_df = eval_df[eval_df["policy"]!=i_policy[3:]]
        eval_df = eval_df.sort_values(by=["energy"])
        score_list = []
        for i, row in eval_df.iterrows():
            score_list.append(policy_scores[f"../{row['policy']}"][zone])

        eval_df["ope_scores"] = score_list
        correlation = spearmanr(eval_df["energy"].values, eval_df["ope_scores"].values, nan_policy="omit")
        spearman_corr[zone] = correlation
    return spearman_corr

In [15]:
def calculate_average_spearman_corr(config):
    policy_scores = get_policy_scores(config)
    spearman_corr = calculate_zonewise_spearman_corr(policy_scores)
    corrs = []
    for zone in spearman_corr:
        corrs.append(abs(spearman_corr[zone].correlation))
    return np.mean(corrs)

In [16]:
test_config = {
    "kernel": "gaussian",
    "bandwidth": 0.02
}
calculate_average_spearman_corr(test_config)

0.5780736422596887

## Wandb Hyperparameter Sweep

In [17]:
def evaluate_hyperparams(config=None):
    with wandb.init(config=config):
        config = wandb.config
        avg_spearman_corr = calculate_average_spearman_corr(config)
        wandb.log({"average_spearman_correlation": avg_spearman_corr})   

In [18]:
project = "ContinuousOPEHyperParamTuning_28_07_2022"

wandb.init(project=project)

sweep_config = {
    "method": "random",
    "metric": {
        "name": "average_spearman_correlation",
        "goal": "maximize"
    },
    "parameters": {
        "kernel": {
            "values": ["gaussian", "epanechnikov", "triangular", "cosine"]
        },
        "bandwidth": {
            "distribution": "uniform",
            "min": 0,
            "max": 0.5
        }
    }
}

sweep_id = wandb.sweep(sweep_config, project=project)

[34m[1mwandb[0m: Currently logged in as: [33maakashsasikumar[0m. Use [1m`wandb login --relogin`[0m to force relogin


Create sweep with ID: qcxa15qo
Sweep URL: https://wandb.ai/aakashsasikumar/ContinuousOPEHyperParamTuning_28_07_2022/sweeps/qcxa15qo


In [None]:
wandb.agent(sweep_id, evaluate_hyperparams, count=100)

[34m[1mwandb[0m: Agent Starting Run: cz9bnc8i with config:
[34m[1mwandb[0m: 	bandwidth: 0.15177222212586255
[34m[1mwandb[0m: 	kernel: cosine


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
average_spearman_correlation,▁

0,1
average_spearman_correlation,0.70286


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: w1viky0n with config:
[34m[1mwandb[0m: 	bandwidth: 0.10137394054261668
[34m[1mwandb[0m: 	kernel: epanechnikov


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
average_spearman_correlation,▁

0,1
average_spearman_correlation,0.60807


[34m[1mwandb[0m: Agent Starting Run: zs7l3d5n with config:
[34m[1mwandb[0m: 	bandwidth: 0.45405500840241136
[34m[1mwandb[0m: 	kernel: triangular


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
average_spearman_correlation,▁

0,1
average_spearman_correlation,0.9113


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 62svd66c with config:
[34m[1mwandb[0m: 	bandwidth: 0.09788721614128798
[34m[1mwandb[0m: 	kernel: triangular


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
average_spearman_correlation,▁

0,1
average_spearman_correlation,0.5851


[34m[1mwandb[0m: Agent Starting Run: 60q6ydj7 with config:
[34m[1mwandb[0m: 	bandwidth: 0.07327809698196275
[34m[1mwandb[0m: 	kernel: cosine


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
average_spearman_correlation,▁

0,1
average_spearman_correlation,0.50493


[34m[1mwandb[0m: Agent Starting Run: 712salzg with config:
[34m[1mwandb[0m: 	bandwidth: 0.03095698180523787
[34m[1mwandb[0m: 	kernel: gaussian


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
average_spearman_correlation,▁

0,1
average_spearman_correlation,0.60475


[34m[1mwandb[0m: Agent Starting Run: c7k2zp2r with config:
[34m[1mwandb[0m: 	bandwidth: 0.2488191943951622
[34m[1mwandb[0m: 	kernel: epanechnikov


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
average_spearman_correlation,▁

0,1
average_spearman_correlation,0.8338


[34m[1mwandb[0m: Agent Starting Run: bw6am8pd with config:
[34m[1mwandb[0m: 	bandwidth: 0.30325464947726516
[34m[1mwandb[0m: 	kernel: epanechnikov


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
average_spearman_correlation,▁

0,1
average_spearman_correlation,0.88411


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: 7jyaetgl with config:
[34m[1mwandb[0m: 	bandwidth: 0.30407785182471414
[34m[1mwandb[0m: 	kernel: gaussian
