In [5]:
import numpy as np
import pandas as pd
from tqdm import tqdm

In [6]:
no_rl = pd.read_csv('rl_results/no_rl/results.csv')

In [40]:
import pickle
with open('dataset_new.pkl', 'rb') as f:
    dataset = pickle.load(f)
dataset = dataset[:5000]

In [7]:
from features import *
arms = {
    'color': color_features,
    'color_pos': color_pos_features,
    'mean_pool': mean_pool,
    'filters': filters_33, 
    'deep': deep_contrastive,
}

USING MPS
USING MPS


In [None]:
# Random choice
def random_choice(seed=0):
    np.random.seed(seed)
    no_rl_shuffled = no_rl.sample(frac=1, random_state=seed).reset_index()

    rewards = []
    for t in range(1000):
        row = no_rl_shuffled.iloc[t]

        arm_name = np.random.choice(list(arms.keys()))
        rewards.append(row[arm_name + '_acc'])
    return rewards

random_choice_results = []
for i in tqdm(range(2000)):
    random_choice_results.append(random_choice(i))
random_choice_results = np.array(random_choice_results)

print(random_choice_results.mean())  # 0.722528479320239
np.save('rl_results/random_choice.npy', random_choice_results)

100%|██████████| 2000/2000 [00:41<00:00, 48.18it/s]


0.7115522361452927


In [15]:
# Best choice
always_best = []
for i in tqdm(range(2000)):
    rewards = []
    np.random.seed(i)
    no_rl_shuffled = no_rl.sample(frac=1, random_state=i).reset_index()
    for t in range(1000):
        rewards.append(max(no_rl_shuffled.iloc[t][[arm + '_acc' for arm in arms]]))
    always_best.append(rewards)
always_best = np.array(always_best)
print(always_best.mean())
np.save('rl_results/always_best.npy', always_best)

100%|██████████| 2000/2000 [02:53<00:00, 11.53it/s]

0.7805464472814061





In [16]:
# Always color
always_color = []
for i in tqdm(range(2000)):
    rewards = []
    np.random.seed(i)
    no_rl_shuffled = no_rl.sample(frac=1, random_state=i).reset_index()
    for t in range(1000):
        rewards.append(no_rl_shuffled.iloc[t].color_acc)
    always_color.append(rewards)

always_color = np.array(always_color)
print(always_color.mean())  # 0.694774764160312
np.save('rl_results/always_color.npy', always_color)

100%|██████████| 2000/2000 [00:30<00:00, 65.14it/s]

0.694774764160312





In [12]:
# Always deep
always_deep = []
for i in tqdm(range(2000)):
    rewards = []
    np.random.seed(i)
    no_rl_shuffled = no_rl.sample(frac=1, random_state=i).reset_index()
    for t in range(1000):
        rewards.append(no_rl_shuffled.iloc[t].deep_acc)
    always_deep.append(rewards)

always_deep = np.array(always_deep)
print(always_deep.mean())  # 0.7261494640049369
np.save('rl_results/always_deep.npy', always_deep)

100%|██████████| 2000/2000 [00:30<00:00, 64.79it/s]


0.7261494640049369


In [8]:
# One choice
(
    no_rl[['color_acc', 'color_pos_acc', 'mean_pool_acc', 'filters_acc', 'deep_acc']].mean(),
    no_rl[['color_acc', 'color_pos_acc', 'mean_pool_acc', 'filters_acc', 'deep_acc']].std()
)

(color_acc        0.694798
 color_pos_acc    0.721632
 mean_pool_acc    0.725385
 filters_acc      0.689901
 deep_acc         0.726060
 dtype: float64,
 color_acc        0.089748
 color_pos_acc    0.093961
 mean_pool_acc    0.094908
 filters_acc      0.089512
 deep_acc         0.089490
 dtype: float64)

In [None]:
# e-greedy
def e_greedy(eps=0.1, seed=0, optimistic_init=False):
    np.random.seed(seed)
    no_rl_shuffled = no_rl.sample(frac=1, random_state=seed).reset_index()
    
    counts = {arm: 0.0 for arm in arms}
    emp_mean = {arm: 1.0 if optimistic_init else 0.0 for arm in arms}

    rewards = []
    for t in range(1000):
        row = no_rl_shuffled.iloc[t]

        # if eps == 0:
        pulled_arm_name = None
        for arm in arms:
            if counts[arm] == 0:
                pulled_arm_name = arm
                break
        if pulled_arm_name is None:
            greedy_arm = np.array(list(emp_mean.values())).argmax()
            if np.random.rand() < eps:
                pulled_arm = np.random.choice([i for i in range(len(arms)) if i != greedy_arm])
            else:
                pulled_arm = greedy_arm
            pulled_arm_name = list(arms.keys())[pulled_arm]

        # Observe reward.
        reward = row[pulled_arm_name + '_acc']
        
        # Update counts.
        counts[pulled_arm_name] += 1
        # if eps != 0 or t < 5:
        emp_mean[pulled_arm_name] = (
            emp_mean[pulled_arm_name] + (1 / counts[pulled_arm_name]) * (reward - emp_mean[pulled_arm_name])
        )

        rewards.append(reward)
    return rewards

# e_greedy_results = []
# for i in tqdm(range(2000)):
#     e_greedy_results.append(e_greedy(eps=0, seed=i))
# e_greedy_results = np.array(e_greedy_results)
# print(e_greedy_results.mean())  # 0.694774764160312
# np.save('./rl_results/greedy.npy', e_greedy_results)

# e_greedy_results = []
# for i in tqdm(range(2000)):
#     e_greedy_results.append(e_greedy(eps=0.1, seed=i))
# e_greedy_results = np.array(e_greedy_results)
# print(e_greedy_results.mean())  # 0.7218202515089509
# np.save('./rl_results/e_greedy_01.npy', e_greedy_results)

# e_greedy_results = []
# for i in tqdm(range(2000)):
#     e_greedy_results.append(e_greedy(eps=0.2, seed=i))
# e_greedy_results = np.array(e_greedy_results)
# print(e_greedy_results.mean())  # 0.7208844949236448
# np.save('./rl_results/e_greedy_02.npy', e_greedy_results)

# e_greedy_results = []
# for i in tqdm(range(2000)):
#     e_greedy_results.append(e_greedy(eps=0.1, seed=i, optimistic_init=True))
# e_greedy_results = np.array(e_greedy_results)
# print(e_greedy_results.mean())  # 0.722528479320239
# np.save('./rl_results/e_greedy_01_opt.npy', e_greedy_results)

e_greedy_results = []
for i in tqdm(range(2000)):
    e_greedy_results.append(e_greedy(eps=0.0, seed=i, optimistic_init=True))
e_greedy_results = np.array(e_greedy_results)
print(e_greedy_results.mean())  # 0.7212095437359041
np.save('./rl_results/greedy_opt.npy', e_greedy_results)

In [18]:
# UCB
def ucb(seed=0, optimistic_init=False, c=2):
    np.random.seed(seed)
    no_rl_shuffled = no_rl.sample(frac=1, random_state=seed).reset_index()
    
    counts = {arm: 0.0 for arm in arms}
    emp_mean = {arm: 1.0 if optimistic_init else 0.0 for arm in arms}

    rewards = []
    for t in range(1000):
        row = no_rl_shuffled.iloc[t]

        ucb_values = []
        pulled_arm_name = None
        for arm in arms:
            if counts[arm] == 0:
                pulled_arm_name = arm
                break
            bound_bonus = np.sqrt((c * np.log(t+1)) / counts[arm])
            ucb_values.append(emp_mean[arm] + bound_bonus)
        if pulled_arm_name is None:
            pulled_arm = np.array(ucb_values).argmax()
            pulled_arm_name = list(arms.keys())[pulled_arm]

        # Observe reward.
        reward = row[pulled_arm_name + '_acc']
        
        # Update counts.
        counts[pulled_arm_name] += 1
        emp_mean[pulled_arm_name] = (
            ((counts[pulled_arm_name] - 1) / counts[pulled_arm_name]) * emp_mean[pulled_arm_name]
            + reward / counts[pulled_arm_name]
        )

        rewards.append(reward)
    return rewards

# ucb_results = []
# for i in tqdm(range(2000)):
#     ucb_results.append(ucb(i))
# ucb_results = np.array(ucb_results)
# print(ucb_results.mean())  # 0.7133616506682934
# np.save('./rl_results/ucb.npy', ucb_results)

# ucb_results = []
# for i in tqdm(range(2000)):
#     ucb_results.append(ucb(i, optimistic_init=True))
# ucb_results = np.array(ucb_results)
# print(ucb_results.mean())  # 0.7133616506682938
# np.save('./rl_results/ucb_optimistic.npy', ucb_results)

ucb_results = []
for i in tqdm(range(2000)):
    ucb_results.append(ucb(i, c=0.5))
ucb_results = np.array(ucb_results)
print(ucb_results.mean())  # 0.7133616506682934
np.save('./rl_results/ucb_smaller_bound.npy', ucb_results)

100%|██████████| 2000/2000 [00:44<00:00, 44.86it/s]

0.7149872201713673





In [11]:
# # PCA embeddings
# from sklearn.decomposition import PCA
# embeddings = []
# for i in tqdm(range(5000)):
#     embeddings.append(np.load(f'./embeddings/embedding_{i}.npy'))
# embeddings = np.stack(embeddings, axis=0)
# pca = PCA(n_components=64)
# print('Before', embeddings.shape)
# embeddings = pca.fit_transform(embeddings)
# print('After', embeddings.shape)
# np.save('./embeddings/pca_embeddings.npy', embeddings)

In [39]:
from tqdm import tqdm
from features import *

pca_embeddings = np.load('./embeddings/pca_embeddings.npy')
pca_embeddings = 1 / (1 + np.exp(pca_embeddings))
def linucb(seed=0):
    np.random.seed(seed)
    no_rl_shuffled = no_rl.sample(frac=1, random_state=seed).reset_index()
    
    A_arms = {arm: np.identity(64) for arm in arms}
    b_arms = {arm: np.zeros(64) for arm in arms}

    alpha = 0.1
    rewards = []
    for t in range(1000):
        row = no_rl_shuffled.iloc[t]

        img_feature = pca_embeddings[row.img_index]
        probs = []
        for arm in arms:
            inv_A = np.linalg.inv(A_arms[arm])
            theta = inv_A @ b_arms[arm]
            p = theta.T @ img_feature + alpha * np.sqrt(img_feature.T @ inv_A @ img_feature)
            probs.append(p)
        pulled_arm = np.array(probs).argmax()
        pulled_arm_name = list(arms.keys())[pulled_arm]

        # Observe reward.
        reward = row[pulled_arm_name + '_acc']
        
        # Update weights.
        A_arms[pulled_arm_name] = A_arms[pulled_arm_name] + np.outer(img_feature, img_feature)
        b_arms[pulled_arm_name] = b_arms[pulled_arm_name] + reward * img_feature

        rewards.append(reward)
    return rewards

linucb_results = []
for i in tqdm(range(2000)):
    linucb_results.append(linucb(i))
linucb_results = np.array(linucb_results)
print(linucb_results.mean())
np.save('./rl_results/linucb.npy', linucb_results)

  pca_embeddings = 1 / (1 + np.exp(pca_embeddings))
