# K-Means Clustering for Macro-Actions

### Imports

In [1]:
import os
from datetime import datetime

import minerl
import gym
import numpy as np
import tqdm
from minerl.data import BufferedBatchIter
from sklearn.cluster import KMeans




### Creating Data Dir (Local)

In [2]:
data_path = os.path.join(os.getcwd(), "data")

if not os.path.exists(data_path):
    os.mkdir(data_path)

os.environ['MINERL_DATA_ROOT'] = data_path # Important

### Globals

In [3]:
OBF_ENVS = ['MineRLTreechopVectorObf-v0', "MineRLObtainDiamondVectorObf-v0"] # Options for user
ENVIRONMENT = 'MineRLTreechopVectorObf-v0'

NUM_CLUSTERS = 10 # Number of Macro Actions we want to extract
CHAIN_LEN = 16 

NUM_BATCHES = 1000
MAX_ACTIONS = 100000
NUM_EPOCHS = 2
BATCH_SIZE = 32
ACTION_SIZE = 64


#### Data Download

In [4]:
# Downloading environment data if not exists
env_data_path = os.path.join(data_path, ENVIRONMENT)
if not os.path.exists(env_data_path):
    minerl.data.download(data_path, environment = ENVIRONMENT) # Careful

### Main
Samples the dataset storing `NUM_BATCHES` batches of actions. Then performs KMeans clustering to 
find `NUM_CLUSTERS` macro actions that represent reasonable actions for our agent to take. 

In [5]:
data = minerl.data.make(environment = ENVIRONMENT)

# Load the dataset storing NUM_BATCHES batches of actions
act_vectors = []
for _, act, _, _,_ in tqdm.tqdm(data.batch_iter(batch_size=BATCH_SIZE, seq_len=CHAIN_LEN, num_epochs=NUM_EPOCHS, preload_buffer_size=20)):
    act_vectors.append(act['vector'])
    if len(act_vectors) > NUM_BATCHES:
        break # Are we biased to the start of the actions?

print(np.array(act_vectors).shape)

# Reshape these the action batches
acts = np.concatenate(act_vectors).reshape(-1, ACTION_SIZE) 
kmeans_acts = acts[:MAX_ACTIONS]

# Use sklearn to cluster the demonstrated actions
kmeans = KMeans(n_clusters=NUM_CLUSTERS, random_state=0).fit(kmeans_acts)

1000it [00:28, 35.55it/s]


(1001, 32, 16, 64)


In [6]:
# Resultant array of n actions
kmeans.cluster_centers_.shape

(10, 64)

In [7]:
kmeans.cluster_centers_[0]

array([-0.08597003,  0.45087818, -0.0276162 ,  0.20232925, -0.055964  ,
        0.0865974 , -0.0216571 ,  0.25072173, -0.31427016, -0.48646174,
        0.03718235, -0.13574366,  0.16456375,  0.27888522, -0.40782313,
       -0.00998286,  0.48131185,  0.45898704,  0.27285716, -0.02326634,
        0.74651659,  0.25825674, -0.31973654,  0.39955536,  0.11930499,
       -0.08074889, -0.18800821, -0.27424003, -0.42986746, -0.21023621,
        0.47781847, -0.1571028 , -0.72401725,  0.15643016, -0.68436443,
        0.32466521, -0.45244323,  0.13704442,  0.53136888,  0.3897049 ,
       -0.34872101,  0.61415974,  0.11200521,  0.0611415 , -0.05284692,
        0.25141653,  0.4787411 ,  0.27491028,  0.56681445,  0.42759532,
        0.01455613,  0.42124615, -0.10184627, -0.3888384 ,  0.18160532,
       -0.43956815,  0.54734265, -0.4005452 ,  0.08690742, -0.0894084 ,
        0.13699018, -0.59398699, -0.06636539, -0.06676099])

In [8]:
i, net_reward, done, env = 0, 0, False, gym.make(ENVIRONMENT)
obs = env.reset()

while not done:
    # Let's use a frame skip of 4 (could you do better than a hard-coded frame skip?)
    if i % 4 == 0:
        action = {
            'vector': kmeans.cluster_centers_[np.random.choice(NUM_CLUSTERS)]
        }

    obs, reward, done, info = env.step(action)
    env.render()

    if reward > 0:
        print("+{} reward!".format(reward))
    net_reward += reward
    i += 1

print("Total reward: ", net_reward)

# i, net_reward, done, env = 0, 0, False, gym.make(ENVIRONMENT)
# max = 2000

# for idx, cluster in enumerate(kmeans.cluster_centers_):
#     obs = env.reset()

#     action = {'vector': cluster}
#     net_rew = 0
#     t_count = 0

#     while not done or t_count < max:
#         _, rew, done, _ = env.step(action)
#         net_rew += rew
#         t_count += 1
#         env.render()
#         if done:
#             print("EPISODE DONE!")
#             break
    
#     print(f"Action {idx+1} reward: {net_rew}")

# env.close()

0it [00:00, ?it/s]

Total reward:  0.0


In [9]:
# # Sampling a random action from our n actions
# # kmeans.cluster_centers_[np.random.choice(NUM_CLUSTERS)]

# # Save action set
# date_suffix = datetime.now().strftime('%m%d%M')
# filename = f"data/action_sets/action_set_{ENVIRONMENT}_{NUM_CLUSTERS}_{date_suffix}.npy"
# np.save(filename, kmeans.cluster_centers_)

# # Load action set
# # np.load(filename, kmeans.cluster_centers_)


: 