In [None]:
!pip install gym[toy_text]

In [None]:
from collections import deque
import pandas as pd
import numpy as np

class Save:
    """Append-only log of environment transitions.

    Every entry is the tuple ``(id, state, action, reward, next_state, prob, done)``
    exactly as it was handed to :meth:`add`.
    """

    def __init__(self) -> None:
        self.buffer = deque()

    def add(
        self,
        id: int,
        state: np.ndarray,
        action: int,
        reward: float,
        next_state: np.ndarray,
        prob: float,
        done: int
    ) -> None:
        """Append one transition to the buffer.

        Args:
            id: Identifier stored in the first field of the record.
            state: Observation before the action.
            action: Action that was taken.
            reward: Reward returned for the action.
            next_state: Observation after the action.
            prob: Probability with which ``action`` was sampled.
            done: Flag telling whether the step ended the episode.
        """
        self.buffer.append((id, state, action, reward, next_state, prob, done))

    def __len__(self) -> int:
        """Return the number of stored transitions."""
        return len(self.buffer)

    def get_df(self, columns):
        """Return the stored transitions as a DataFrame, one row per transition.

        Args:
            columns: Column labels, one per field of a stored record (7 fields).

        Returns:
            ``pandas.DataFrame`` whose rows follow insertion order. Array-valued
            fields (``state``/``next_state``) are kept as one object per cell.
        """
        # Read this instance's own buffer (not a module-level object), and let
        # pandas build the frame from the record tuples directly: stacking
        # tuples that mix scalars and arrays would create a ragged array.
        return pd.DataFrame(list(self.buffer), columns=columns)

In [None]:
import gym
import numpy as np

ENV_NAME = 'Acrobot-v1'
TRIAL_LEN = 10000  # total number of environment steps to log
P = [0.15, 0.1, 0.75]  # fixed sampling probability per action index
ID = 1
COLUMNS = ['ID', 'State', 'Action', 'Reward', 'Next_state', 'Prob', 'is_done']

save = Save()

env = gym.make(ENV_NAME)


def _reset_env(env):
    """Reset ``env`` and return only the observation.

    Newer gym releases return ``(observation, info)`` from ``reset()``;
    older ones return the bare observation.
    """
    result = env.reset()
    return result[0] if isinstance(result, tuple) else result


try:
    observation = _reset_env(env)
    for _ in range(TRIAL_LEN):
        # Sample an action index according to the fixed distribution P.
        action = int(np.random.choice(len(P), p=P))

        step_result = env.step(action)
        if len(step_result) == 5:
            # Newer gym API: termination and truncation are reported separately.
            next_observation, reward, terminated, truncated, info = step_result
            done = terminated or truncated
        else:
            next_observation, reward, done, info = step_result

        save.add(ID, observation, action, reward, next_observation, P[action], done)

        # A finished episode must be reset before stepping again.
        observation = _reset_env(env) if done else next_observation
finally:
    # Release the environment even if the rollout fails part-way.
    env.close()

In [None]:
# Materialise the collected transitions as a DataFrame labelled with COLUMNS.
log = save.get_df(COLUMNS)

In [None]:
def add_pred_col(model, test_X, test_dataset):
    """Return a copy of ``test_dataset`` with the model's predictions attached.

    Args:
        model: Fitted classifier exposing ``predict_proba`` (and, optionally,
            ``classes_``).
        test_X: Feature matrix passed to ``model.predict_proba``.
        test_dataset: DataFrame with one row per row of ``test_X``.

    Returns:
        A copy of ``test_dataset`` with two extra columns: ``pred_action``
        (the most probable action) and ``pred_p`` (its predicted probability).
        The input frame is not modified.
    """
    test_probs = model.predict_proba(test_X)

    best_columns = np.argmax(test_probs, 1)
    # The columns of predict_proba follow model.classes_, so translate the
    # column index into the actual label; fall back to the raw index when the
    # model does not expose classes_.
    classes = getattr(model, 'classes_', None)
    if classes is None:
        pred_actions = best_columns
    else:
        pred_actions = np.asarray(classes)[best_columns]
    pred_p = np.max(test_probs, 1)

    cp_test = test_dataset.copy()
    cp_test['pred_action'] = pred_actions
    cp_test['pred_p'] = pred_p

    return cp_test

In [None]:
def behavior_policy(mean_probs):
    """Turn per-action mean probabilities into a policy by applying ``softmax``."""
    policy = softmax(mean_probs)
    return policy

In [None]:
def get_probs(test_dataset):
    """Collect mean logged and mean predicted probabilities per action.

    Args:
        test_dataset: DataFrame with the columns ``Action``, ``Prob``,
            ``pred_action`` and ``pred_p``.

    Returns:
        Array of shape ``(2, number of distinct values in Action)``. Row 0
        holds the mean ``Prob`` of the rows where the action was taken, row 1
        the mean ``pred_p`` of the rows where the action was predicted. The
        integer value of each action is used as its column index.
    """
    observed_actions = test_dataset.Action.unique()
    real_and_pred = np.zeros((2, len(observed_actions)))

    for raw_action in observed_actions:
        idx = int(raw_action)
        taken = test_dataset.Action == idx
        predicted = test_dataset.pred_action == idx

        mean_logged = np.mean(test_dataset[taken].Prob)
        mean_predicted = np.mean(test_dataset[predicted].pred_p)

        real_and_pred[0, idx] = mean_logged
        real_and_pred[1, idx] = mean_predicted

    return real_and_pred

In [None]:
def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    Args:
        x: Array-like of real numbers.

    Returns:
        Array of the same shape as ``x`` whose entries are positive and sum
        to 1. An empty input yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        return np.exp(x)
    # Shifting by the maximum leaves the result unchanged mathematically but
    # keeps exp() from overflowing on large inputs.
    exps = np.exp(x - np.max(x))
    return exps / np.sum(exps)

def KL_divergence(read_and_pred_P):
    """Return the KL divergence between row 0 and the softmax of row 1.

    Args:
        read_and_pred_P: 2-D array; row 0 is used as the reference
            distribution, row 1 is passed through ``softmax`` first.

    Returns:
        ``sum(n * log(n / d))`` over the entries where ``n`` is non-zero.
    """
    n = np.asarray(read_and_pred_P[0, :], dtype=float)
    d = softmax(read_and_pred_P[1, :])
    # Entries with zero reference probability contribute nothing to the
    # divergence; evaluating 0 * log(0) for them would yield NaN instead.
    # NaN entries are kept (NaN != 0) so invalid input still surfaces.
    support = n != 0
    return np.sum(n[support] * np.log(n[support] / d[support]))

In [None]:
def data_split(dataset, offset_rate):
    """Split ``dataset`` into leading train rows and trailing test rows.

    Args:
        dataset: DataFrame to split; row order is preserved.
        offset_rate: Fraction of rows assigned to the training part. The
            border index is ``int(len(dataset) * offset_rate)``.

    Returns:
        Tuple ``(train_dataset, test_dataset)``.
    """
    # Operate on the frame that was passed in rather than on a notebook global.
    test_border = int(len(dataset) * offset_rate)
    train_dataset = dataset.iloc[:test_border, :]
    test_dataset = dataset.iloc[test_border:, :]
    return train_dataset, test_dataset

In [None]:
def process(model, train_X, train_Y, test_X, test_dataset, model_name):
    """Fit ``model``, attach its predictions to the test frame and print a summary.

    Prints the mean predicted probabilities per action, the behavior policy
    derived from them and the KL divergence, each prefixed with ``model_name``.

    Returns:
        The copy of ``test_dataset`` produced by ``add_pred_col``.
    """
    model.fit(train_X, train_Y)

    scored = add_pred_col(model, test_X, test_dataset)
    prob_table = get_probs(scored)

    policy = behavior_policy(prob_table[1, :])
    divergence = KL_divergence(prob_table)

    report = (
        ('{} prob before correction : ', prob_table[1, :]),
        ('{} behavior policy : ', policy),
        ('{} kl : ', divergence),
    )
    for template, value in report:
        print(template.format(model_name), value)

    return scored

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

kn = KNeighborsClassifier(n_neighbors=len(log.Action.unique()))
rf = RandomForestClassifier()
mlp = MLPClassifier()

# Features are the observed state only; the target is the action that was taken.
train_dataset, test_dataset = data_split(log, offset_rate=0.8)
train_X = np.stack([x for x in train_dataset.State])
train_Y = np.array(train_dataset.Action).astype('int')
test_X = np.stack([x for x in test_dataset.State])
test_Y = np.array(test_dataset.Action).astype('int')

# Each classifier is evaluated under its own label; the 'mlp' run uses `mlp`.
test_dataset = process(kn, train_X, train_Y, test_X, test_dataset, 'KNN')
test_dataset = process(rf, train_X, train_Y, test_X, test_dataset, 'rf')
test_dataset = process(mlp, train_X, train_Y, test_X, test_dataset, 'mlp')

In [None]:
# Refit the KNN classifier on train_X/train_Y and display its score on test_X/test_Y.
kn.fit(train_X, train_Y)
kn.score(test_X, test_Y)

In [None]:
def _transition_features(frame):
    """Stack state, reward, next state and done flag into one 2-D feature matrix."""
    return np.hstack((
        np.stack(list(frame.State)),
        np.stack(list(frame.Reward)).reshape(-1, 1),
        np.stack(list(frame.Next_state)),
        np.stack(list(frame.is_done)).reshape(-1, 1),
    ))


kn = KNeighborsClassifier(n_neighbors=len(log.Action.unique()))
rf = RandomForestClassifier()
mlp = MLPClassifier()

# Same split as before, but the features now cover the whole transition.
train_dataset, test_dataset = data_split(log, offset_rate=0.8)
train_X = _transition_features(train_dataset)
train_Y = np.array(train_dataset.Action).astype('int')
test_X = _transition_features(test_dataset)
test_Y = np.array(test_dataset.Action).astype('int')

# Each classifier is evaluated under its own label; the 'mlp' run uses `mlp`.
test_dataset = process(kn, train_X, train_Y, test_X, test_dataset, 'KNN')
test_dataset = process(rf, train_X, train_Y, test_X, test_dataset, 'rf')
test_dataset = process(mlp, train_X, train_Y, test_X, test_dataset, 'mlp')

In [None]:
# Display the State entry of the first row of the test split.
test_dataset['State'].iloc[0]

In [None]:
# Refit the KNN classifier on the current train_X/train_Y and display its score on test_X/test_Y.
kn.fit(train_X, train_Y)
kn.score(test_X, test_Y)