In [1]:
import numpy as np
import pandas as pd
import cacb
from cacb.cacb import ContinuousActionContextualBanditModel
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

ModuleNotFoundError: No module named 'cacb'

## Policy evaluation

loss = mean absolute difference between the predicted cost and the actual cost of the logged action

1. train a candidate model on the data logged by the existing model (excluding a holdout set)

2. with both the current model and the candidate model, compute loss = abs(cost_pred - cost_true) for each of the examples in the holdout set

3. compute the mean of the losses for both models => choose the model whose average loss is smaller

In [10]:
def get_cost(action, context):
    """Return the noisy squared-error cost of playing ``action`` in ``context``.

    The ideal action is 5 when ``context[0] == 0`` and 8 otherwise. Every call
    shifts that target by a fresh draw from ``np.random.random()`` (uniform on
    [0, 1)) and returns the squared distance between ``action`` and the
    shifted target.
    """
    target = 5 if context[0] == 0 else 8
    noisy_target = target + np.random.random()
    return (action - noisy_target) ** 2

def simulate(model, n=200, n_learn=100, epsilon=0.08, exploration_width=1):
    """Run ``n`` simulated bandit rounds, scoring each action with ``get_cost``.

    Each round builds a one-element context (``[1]`` on rounds 0, 7, 14, ...
    and ``[0]`` on all others), asks ``model`` for an action and its
    probability, and computes the resulting cost. During the first ``n_learn``
    rounds the example is passed to ``model.learn``; afterwards it is passed
    to ``model._log_example`` instead.

    Args:
        model: object exposing ``predict(context, epsilon=..., exploration_width=...)``
            returning ``(action, prob)``, plus ``learn`` and ``_log_example``
            taking ``(context, action, cost, prob)``.
        n: total number of rounds to simulate.
        n_learn: number of initial rounds handed to ``model.learn``.
        epsilon: forwarded to ``model.predict`` (default matches the value
            previously hard-coded here).
        exploration_width: forwarded to ``model.predict`` (default matches the
            value previously hard-coded here).
    """
    for i in range(n):
        # Every 7th round uses context 1; all other rounds use context 0.
        context = np.array([1]) if i % 7 == 0 else np.array([0])

        action, prob = model.predict(
            context, epsilon=epsilon, exploration_width=exploration_width
        )
        cost = get_cost(action, context)

        if i < n_learn:
            model.learn(context, action, cost, prob)
        else:
            # NOTE(review): this calls a private method of the model; presumably
            # it records the example without training on it — confirm against
            # the cacb implementation.
            model._log_example(context, action, cost, prob)
            
def train(model, logged_data_df):
    """Replay logged bandit interactions into ``model``, one row at a time.

    Every column of ``logged_data_df`` other than ``action``, ``cost`` and
    ``prob`` is treated as a context feature. Rows are passed to
    ``model.learn(context, action, cost, prob)`` in the frame's own order.
    """
    bookkeeping = ["action", "cost", "prob"]
    for _, record in logged_data_df.iterrows():
        # Whatever is left after removing the bookkeeping fields is the context.
        features = record.drop(bookkeeping).values
        model.learn(features, record.action, record.cost, record.prob)
        
def loss(model, logged_data_df):
    """Mean absolute error between logged and model-predicted costs.

    For each row, the columns other than ``action``, ``cost`` and ``prob`` form
    the context. The model's prediction is looked up as
    ``model.get_costs_per_action(context)[action]`` and compared with the
    row's logged ``cost``. Returns the mean of the absolute differences.
    """
    bookkeeping = ["action", "cost", "prob"]

    def _signed_error(record):
        # Difference (true - predicted) for the action that was actually logged.
        chosen = record.action
        features = record.drop(bookkeeping).values
        predicted = model.get_costs_per_action(features)[chosen]
        return record.cost - predicted

    errors = [_signed_error(record) for _, record in logged_data_df.iterrows()]
    return np.mean(np.abs(errors))

# Number of initial simulated rounds in which the current model learns; also the
# number of leading logged rows used to train the candidate model.
TRAIN_SAMPLES = 100
# Number of trailing logged rows held out to evaluate both models.
HOLDOUT_SAMPLES = 100

In [11]:
# Seed NumPy's global RNG so the noise drawn inside get_cost follows the same
# sequence on every "Restart & Run All".
# NOTE(review): the run is only fully repeatable if the bandit model's own
# exploration also draws from np.random — confirm against cacb.
np.random.seed(0)

# Current policy: a bandit over actions in [0, 10] backed by linear regression.
current_model = ContinuousActionContextualBanditModel(
    min_value=0,
    max_value=10,
    action_width=1,
    initial_action=0,
    regression_model=LinearRegression(),
    decay_rate=10
)
# The model learns during the first TRAIN_SAMPLES rounds; the remaining
# HOLDOUT_SAMPLES rounds go to _log_example instead and form the holdout set.
simulate(current_model, TRAIN_SAMPLES+HOLDOUT_SAMPLES, TRAIN_SAMPLES)

In [21]:
# Candidate policy: same constructor settings as the current model, but with a
# gradient-boosting regressor in place of linear regression.
candidate_model = ContinuousActionContextualBanditModel(
    min_value=0,
    max_value=10,
    action_width=1,
    initial_action=0,
    regression_model=GradientBoostingRegressor(),
    decay_rate=10
)
# Train the candidate offline on the first TRAIN_SAMPLES rows logged by the
# current model.
# NOTE(review): this assumes the log holds the learn() rounds as well as the
# _log_example() rounds; if only the latter are logged, these rows would overlap
# the holdout slice used for evaluation — confirm against cacb.
train(candidate_model, current_model.get_logged_data_df().iloc[:TRAIN_SAMPLES])

In [22]:
# Holdout loss of the current model: mean absolute error between its predicted
# cost and the logged cost over the last HOLDOUT_SAMPLES logged rows.
loss(current_model, current_model.get_logged_data_df().iloc[-HOLDOUT_SAMPLES:])

2.487911315536947

In [23]:
# Holdout loss of the candidate model, evaluated on the same last
# HOLDOUT_SAMPLES rows logged by the current model.
loss(candidate_model, current_model.get_logged_data_df().iloc[-HOLDOUT_SAMPLES:])

2.2008710252038433